From 8494275d7608942aa584c9c13bd5e2d77be9906c Mon Sep 17 00:00:00 2001
From: Chen Feiyue <69809761+chenfeiyue-cfy@users.noreply.github.com>
Date: Wed, 8 Jan 2025 13:22:46 +0800
Subject: [PATCH] Update internal ovxlib to release/1.2.22 (#706)

* Update internal ovxlib to release/1.2.22

Signed-off-by: Feiyue.Chen

* Refine yaml file for blocking tfhub model tests

Signed-off-by: Feiyue.Chen

---------

Signed-off-by: Feiyue.Chen
---
 .github/workflows/cmake_x86_vsim.yml | 106 +-
 VERSION | 2 +-
 .../include/custom/custom_node_type.def | 1 +
 .../vx/internal/include/custom/custom_ops.def | 1 +
 .../custom/ops/vsi_nn_op_custom_letterbox.h | 61 +
 .../include/custom/vsi_nn_custom_node_type.h | 1 +
 src/tim/vx/internal/include/interface/ops.def | 1 +
 .../include/ops/vsi_nn_op_pre_process_rgb.h | 2 +-
 .../vx/internal/include/ops/vsi_nn_op_rope.h | 49 +
 .../vx/internal/include/ops/vsi_nn_op_topk.h | 1 +
 .../include/utils/vsi_nn_dtype_util_prv.h | 16 +-
 src/tim/vx/internal/include/vsi_nn/vsi_nn.h | 2034 ----------
 src/tim/vx/internal/include/vsi_nn_context.h | 18 +-
 .../internal/include/vsi_nn_feature_config.h | 3 +
 .../vx/internal/include/vsi_nn_node_type.h | 2 +
 src/tim/vx/internal/include/vsi_nn_tensor.h | 4 +-
 .../vx/internal/include/vsi_nn_tensor_util.h | 28 +
 src/tim/vx/internal/include/vsi_nn_version.h | 2 +-
 .../ops/kernel/evis/custom_letterbox_evis.c | 475 +++
 .../ops/kernel/evis/custom_softmax_evis.c | 124 +-
 .../custom/ops/vsi_nn_op_custom_letterbox.c | 227 ++
 src/tim/vx/internal/src/kernel/cl/cumsum_cl.c | 6 +
 .../vx/internal/src/kernel/cl/matrixmul_cl.c | 4 +-
 .../vx/internal/src/kernel/cl/one_hot_cl.c | 1 +
 src/tim/vx/internal/src/kernel/cl/prelu_cl.c | 2 +-
 src/tim/vx/internal/src/kernel/cl/rope_cl.c | 329 ++
 src/tim/vx/internal/src/kernel/cl/swish_cl.c | 3 +-
 src/tim/vx/internal/src/kernel/cl/topk_cl.c | 8 +-
 .../internal/src/kernel/evis/bucketize_evis.c | 3 +-
 .../src/kernel/evis/depthwise_conv1d_evis.c | 4 +-
 .../kernel/evis/group_normalization_evis.c | 21 +-
 .../vx/internal/src/kernel/evis/prelu_evis.c | 230 +-
 .../src/kernel/evis/resize_bilinear_evis.c | 607 +--
 .../vx/internal/src/kernel/evis/rope_evis.c | 744 ++++
 .../src/kernel/evis/scatter_nd_update_evis.c | 29 +
 .../vx/internal/src/kernel/evis/swish_evis.c | 3 +-
 .../vx/internal/src/kernel/vsi_nn_kernel.c | 24 +-
 .../src/kernel/vsi_nn_kernel_selector.c | 7 +-
 .../vx/internal/src/kernel/vx/group_norm_vx.c | 89 +
 .../internal/src/kernel/vx/instance_norm_vx.c | 87 +
 .../vx/internal/src/kernel/vx/layer_norm_vx.c | 22 +-
 src/tim/vx/internal/src/kernel/vx/pad2_vx.c | 3 +-
 .../internal/src/kernel/vx/relationalops_vx.c | 27 +-
 src/tim/vx/internal/src/kernel/vx/swish_vx.c | 3 +-
 .../vx/internal/src/libnnext/ops/cl/cumsum.cl | 129 +-
 .../internal/src/libnnext/ops/cl/cumsum_2d.cl | 554 +--
 .../internal/src/libnnext/ops/cl/one_hot.cl | 27 +
 .../vx/internal/src/libnnext/ops/cl/rope_0.cl | 373 ++
 .../src/libnnext/ops/vx/custom_letterbox.vx | 307 ++
 .../src/libnnext/ops/vx/custom_softmax.vx | 94 +-
 .../libnnext/ops/vx/group_normalization_2.vx | 44 +-
 .../vx/internal/src/libnnext/ops/vx/prelu.vx | 161 +-
 ...resize_bilinear_U8_half_pixel_centers_3.vx | 181 +
 ...resize_bilinear_U8_half_pixel_centers_4.vx | 102 +
 ...resize_bilinear_U8_half_pixel_centers_5.vx | 167 +
 .../vx/internal/src/libnnext/ops/vx/rope_0.vx | 303 ++
 .../vx/internal/src/libnnext/ops/vx/rope_1.vx | 245 ++
 .../vx/internal/src/libnnext/ops/vx/rope_2.vx | 312 ++
 .../vx/internal/src/libnnext/ops/vx/rope_3.vx | 312 ++
 .../ops/vx/scatter_nd_update_special.vx | 98 +
 .../src/libnnext/vsi_nn_libnnext_resource.c | 3438 ++++++++++++++---
 .../internal/src/libnnext/vsi_nn_vxkernel.c | 21 +-
 .../internal/src/ops/vsi_nn_op_batch_norm.c | 192 +-
 .../vx/internal/src/ops/vsi_nn_op_bitcast.c | 1 +
 .../vx/internal/src/ops/vsi_nn_op_cumsum.c | 1 +
 .../internal/src/ops/vsi_nn_op_dataconvert.c | 1 +
 .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 13 +-
 .../vx/internal/src/ops/vsi_nn_op_gather.c | 1 +
 .../src/ops/vsi_nn_op_groupnormalize.c | 2 +
 .../vx/internal/src/ops/vsi_nn_op_grucell.c | 5 +-
 .../src/ops/vsi_nn_op_l1_layer_norm.c | 22 -
 .../src/ops/vsi_nn_op_l2normalizescale.c | 3 +-
 .../src/ops/vsi_nn_op_layernormalize.c | 2 +-
 .../src/ops/vsi_nn_op_lstmunit_activation.c | 3 +-
 .../vx/internal/src/ops/vsi_nn_op_one_hot.c | 6 +
 .../src/ops/vsi_nn_op_pre_process_rgb.c | 98 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c | 96 +-
 .../vx/internal/src/ops/vsi_nn_op_reduce.c | 4 +-
 .../vx/internal/src/ops/vsi_nn_op_reshape.c | 31 +-
 .../vx/internal/src/ops/vsi_nn_op_reshape2.c | 27 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_rope.c | 145 +
 .../src/ops/vsi_nn_op_softmax_internal.c | 3 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 97 +-
 .../src/utils/vsi_nn_code_generator.c | 1 +
 src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 2 +-
 src/tim/vx/internal/src/utils/vsi_nn_util.c | 17 +-
 src/tim/vx/internal/src/vsi_nn_context.c | 127 +
 src/tim/vx/internal/src/vsi_nn_graph.c | 5 +-
 .../internal/src/vsi_nn_graph_optimization.c | 9 +-
 .../vx/internal/src/vsi_nn_pre_post_process.c | 46 +-
 src/tim/vx/internal/src/vsi_nn_rnn_helper.c | 3 +-
 src/tim/vx/internal/src/vsi_nn_tensor.c | 91 +
 .../vx/internal/src/vsi_nn_tensor_util_prv.h | 10 +
 src/tim/vx/internal/src/vsi_nn_types_prv.h | 5 +
 94 files changed, 9466 insertions(+), 3885 deletions(-)
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_rope.h
 delete mode 100644 src/tim/vx/internal/include/vsi_nn/vsi_nn.h
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c
 create mode 100644 src/tim/vx/internal/src/kernel/cl/rope_cl.c
 create mode 100644 src/tim/vx/internal/src/kernel/evis/rope_evis.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/group_norm_vx.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_rope.c

diff --git a/.github/workflows/cmake_x86_vsim.yml b/.github/workflows/cmake_x86_vsim.yml
index 02a60b0..45d399a 100644
--- a/.github/workflows/cmake_x86_vsim.yml
+++ b/.github/workflows/cmake_x86_vsim.yml
@@ -124,7 +124,7 @@ jobs:
       run: |
         git config --global user.email "xiang.zhang@verisilicon.com"
        git config --global
user.name "xiang.zhang" - git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0 + git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1 git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}} @@ -283,61 +283,61 @@ jobs: # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite - tfhub-efficientdet-lite0: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite0: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite - tfhub-efficientdet-lite1: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite1: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: 
download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite - tfhub-efficientdet-lite2: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite2: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite - tfhub-efficientdet-lite3: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite3: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite # acuity-yolov3-608-quant: # runs-on: ubuntu-latest diff --git a/VERSION b/VERSION index fd9d1a5..9a83513 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ 
-1.2.14 +1.2.22 diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def b/src/tim/vx/internal/include/custom/custom_node_type.def index c5ef3e0..9ac9424 100644 --- a/src/tim/vx/internal/include/custom/custom_node_type.def +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample) DEF_NODE_TYPE(custom_tiny_yolov4_postprocess) DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence) DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box) +DEF_NODE_TYPE(custom_letterbox) \ No newline at end of file diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def index 2074b8f..47f25f2 100644 --- a/src/tim/vx/internal/include/custom/custom_ops.def +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE) DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS) DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE) DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX) +DEF_OP(CUSTOM_LETTERBOX) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h new file mode 100644 index 0000000..ef01263 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h @@ -0,0 +1,61 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H +#define _VSI_NN_OP_CUSTOM_LETTERBOX_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_letterbox_param +{ + struct _custom_letterbox_local_data_t* local; + int32_t new_shape_w; + int32_t new_shape_h; + vx_bool auto_bool; + vx_bool scaleFill; + vx_bool scaleup; + int32_t stride; + vx_bool center; + float mean_r; + float mean_g; + float mean_b; + float scale_r; + float scale_g; + float scale_b; + int32_t pad_value_r; + int32_t pad_value_g; + int32_t pad_value_b; + vx_bool reverse_channel; +} vsi_nn_custom_letterbox_param; +_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \ + vsi_nn_custom_lertterbox_h ); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h index eb23a20..2c83d81 100644 --- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h +++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h @@ -34,5 +34,6 @@ #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h" #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h" #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h" +#include "custom/ops/vsi_nn_op_custom_letterbox.h" #endif diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 23d3f74..28f5716 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -203,3 +203,4 @@ DEF_OP(BITCAST) DEF_OP(GROUPED_CONV3D) DEF_OP(COL2IM) DEF_OP(L1_LAYER_NORM) +DEF_OP(ROPE) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h index 9e05a59..0944ae6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param float g_scale; float b_scale; /* pre process rgb layer local data structure */ - vsi_nn_pre_process_rgb_lcl_data local; + vsi_nn_pre_process_rgb_lcl_data *local; } vsi_nn_pre_process_rgb_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rope.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rope.h new file mode 100644 index 0000000..7d16fb0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rope.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_ROPE_H +#define _VSI_NN_OP_ROPE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_rope_param +{ + struct _rope_local_data_t* local; + // Add parameters here + int32_t axis; + vsi_bool interleaved; +} vsi_nn_rope_param; +_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \ + vsi_nn_rope_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h index bccc0b5..99d57e2 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h @@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param { uint32_t k; int32_t axis; + struct _topk_local_data_t* local; } vsi_nn_topk_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index ed78571..b005473 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32 static VSI_INLINE_API float bfp16_to_fp32 ( - int16_t in + uint16_t in ) { - uint32_t t1, t2, t3; float out; fp32_bit_cast_t fp32_bit_cast; - t1 = in & 0x00FF; // Mantissa - t2 = in & 0xFF00; // Sign bit + Exponent - t3 = in & 0x7F00; // Exponent + fp32_bit_cast.data = (uint32_t)(in << 16); - t1 <<= 16; - t2 <<= 16; // Shift (sign + Exponent) bit into position - t1 |= t2; // Re-insert (sign + Exponent) bit - - fp32_bit_cast.data = t1; out = fp32_bit_cast.val; - return t3 == 0 ? 
0.0f : out; + return out; } /* bfp16_to_fp32() */ static VSI_INLINE_API uint16_t fp32_to_fp16 @@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32 *dst = fp16_to_fp32( *(int16_t *)src ); break; case VSI_NN_TYPE_BFLOAT16: - *dst = bfp16_to_fp32( *(int16_t *)src ); + *dst = bfp16_to_fp32( *(uint16_t *)src ); break; case VSI_NN_TYPE_FLOAT8_E4M3: *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale); diff --git a/src/tim/vx/internal/include/vsi_nn/vsi_nn.h b/src/tim/vx/internal/include/vsi_nn/vsi_nn.h deleted file mode 100644 index 115a2e8..0000000 --- a/src/tim/vx/internal/include/vsi_nn/vsi_nn.h +++ /dev/null @@ -1,2034 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -/** - * @file vsi_nn.h - */ -#ifndef _VSI_NN_INTERFACE_H -#define _VSI_NN_INTERFACE_H - -#if defined(_MSC_VER) -#define EXPORT __declspec(dllexport) -#elif defined(__linux__) -#define EXPORT __attribute__((visibility("default"))) -#else -#define EXPORT -#endif - -#if !defined(_IN) -#define _IN -#endif -#if !defined(_OUT) -#define _OUT -#endif -#if !defined(_INOUT) -#define _INOUT -#endif -#if !defined(_OPTIONAL) -#define _OPTIONAL -#endif - -#include -#include - -#if defined(__cplusplus) -#define __BEGIN_DECLS extern "C" { -#define __END_DECLS } -#else -#define __BEGIN_DECLS -#define __END_DECLS -#endif - -__BEGIN_DECLS - - -#ifndef TRUE -#define TRUE (1) -#endif -#ifndef FALSE -#define FALSE (0) -#endif - - -/** - * Return codes. - */ -typedef enum -{ - /** - * Operation was succesful. - */ - VSI_NN_ERROR_OK = 0, - - /** - * Failure caused by vsi_nn api fail. - */ - VSI_NN_ERROR_API_FAIL = 1, - - /** - * Failure caused by not enough available memory. - */ - VSI_NN_ERROR_OUT_OF_MEMORY = 2, - - /** - * Failure caused by unexpected null argument. - */ - VSI_NN_ERROR_UNEXPECTED_NULL = 3, - - /** - * Failure caused by invalid function arguments, invalid model definition, - * invalid execution definition or invalid data at execution time. - */ - VSI_NN_ERROR_VALUED_ERROR = 4, - - /** - * Failure caused by operations that need completed graph. - */ - VSI_NN_ERROR_UNCOMPLETE_GRAPH = 5, - - /** - * Failure caused by insearting a keyword argument repeatly. - */ - VSI_NN_ERROR_KWARGS_REPEAT = 6, -} VSI_NN_error_e; - -/** - * Implicit padding algorithms. 
- */ -typedef enum -{ - /** - * Pad with const value which are specific by others parameters. - */ - VSI_NN_IMPLICIT_PADDING_NONE = 0, - - /** - * Implicit(VALID) padding. - * No padding. - */ - VSI_NN_IMPLICIT_PADDING_VALID = 1, - - /** - * Implicit(SAME) padding. - * Padding on both ends are the "same". - */ - VSI_NN_IMPLICIT_PADDING_SAME = 2, -} VSI_NN_implicit_padding_e; - -/** - * Padding mode. - */ -typedef enum -{ - /** - * Pad with const value which are specific by others parameters, default 0. - */ - VSI_NN_PADDING_MODE_CONSTANT = 0, - - /** - * Reflect padding mode - */ - VSI_NN_PADDING_MODE_REFLECT = 1, - - /** - * Symmetric padding mode - */ - VSI_NN_PADDING_MODE_SYMMETRIC = 2, - - /** - * Replicate padding mode - */ - VSI_NN_PADDING_MODE_REPLICATE = 3, -} VSI_NN_padding_mode_e; - -/** - * Rounding methods - */ -typedef enum -{ - /** - * Floor rounding - */ - VSI_NN_ROUNDING_FLOOR = 0, - /** - * Ceiling rounding - */ - VSI_NN_ROUNDING_CEIL = 1, -} VSI_NN_rounding_e; - -/** - * LSH Projection supported types. - */ -typedef enum -{ - /** - * Computed bit vector is considered to be sparse. - */ - VSI_NN_LSH_PROJECTION_SPARSE = 1, - /** - * Computed bit vector is considered to be dense. - */ - VSI_NN_LSH_PROJECTION_DENSE = 2, -} VSI_NN_lsh_projection_type_e; - -/** - * Supported activation function types. - */ -typedef enum -{ - /** No activation */ - VSI_NN_ACTIVATION_NONE = 0, - /** ReLU activation */ - VSI_NN_ACTIVATION_RELU = 1, - /** ReLU1 activation */ - VSI_NN_ACTIVATION_RELU1 = 2, - /** ReLU6 activation */ - VSI_NN_ACTIVATION_RELU6 = 3, - /** TanH activation */ - VSI_NN_ACTIVATION_TANH = 4, - /** Sigmoid activation */ - VSI_NN_ACTIVATION_SIGMOID = 5, -} VSI_NN_activation_e; - -/** - * Tensor types. - * - * The type of tensors that can be added to a graph. - */ -typedef enum -{ - /** A tensor of IEEE 754 16 bit floating point values */ - VSI_NN_TENSOR_FLOAT16 = 0, - /** A tensor of 32 bit floating point values */ - VSI_NN_TENSOR_FLOAT32 = 1, - /** A tensor of 64 bit floating point values */ - VSI_NN_TENSOR_FLOAT64 = 2, - /** - * A tensor of 8 bit boolean values. - * - * Values of this operand type are either true or false. A zero value - * represents false; any other value represents true. - */ - VSI_NN_TENSOR_BOOL8 = 3, - /** A tensor of 8 bit integer values */ - VSI_NN_TENSOR_INT8 = 4, - /** A tensor of 16 bit integer values */ - VSI_NN_TENSOR_INT16 = 5, - /** A tensor of 32 bit integer values */ - VSI_NN_TENSOR_INT32 = 6, - /** A tensor of 64 bit integer values */ - VSI_NN_TENSOR_INT64 = 7, - /** A tensor of 8 bit unsigned integer values */ - VSI_NN_TENSOR_UINT8 = 8, - /** A tensor of 16 bit unsigned integer values */ - VSI_NN_TENSOR_UINT16 = 9, - /** A tensor of 32 bit unsigned integer values */ - VSI_NN_TENSOR_UINT32 = 10, - /** A tensor of 64 bit unsigned integer values */ - VSI_NN_TENSOR_UINT64 = 11, - /** A tensor of 16 bit truncate floating point values */ - VSI_NN_TENSOR_BFLOAT16 = 12, -} VSI_NN_tensor_type_e; - -typedef enum { - /** Not a quantized tensor */ - VSI_NN_TENSOR_QUANT_NONE = 0, - /** - * A tensor of 8 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 8 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). 
- */ - VSI_NN_TENSOR_QUANT8_DFP = 1, - /** - * A tensor of 16 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 16 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). - */ - VSI_NN_TENSOR_QUANT16_DFP = 2, - /** - * A tensor of 32 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 16 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). - */ - VSI_NN_TENSOR_QUANT32_DFP = 3, - /** - * A tensor of 64 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 16 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). - */ - VSI_NN_TENSOR_QUANT64_DFP = 4, - /** - * A tensor of 8 bit signed integer values that represent real numbers - * - * Attached to this tensor is a numbers that can be used to convert - * the 8 bit integer to the real value. - * - * scale: a 32 bit floating point value greater than zero. - * - * The formula is: - * real_value = integer_value * scale. - */ - VSI_NN_TENSOR_QUANT8_SYMM = 5, - /** - * A tensor of 32 bit signed integer values that represent real numbers - * - * Attached to this tensor is a numbers that can be used to convert - * the 8 bit integer to the real value. - * - * scale: a 32 bit floating point value greater than zero. - * - * The formula is: - * real_value = integer_value * scale. - */ - VSI_NN_TENSOR_QUANT32_SYMM = 6, - /** - * A tensor of 8 bit unsigned integer values that represent real numbers - * - * Attached to this tensor are two numbers that can be used to convert - * the 8 bit integer to the real value. - * - * scale: a 32 bit floating point value greater than zero. - * zero_point: a 32 bit signed integer, in range [0, 255]. - * - * The formula is: - * real_value = (integer_value - zero_point) * scale. - */ - VSI_NN_TENSOR_QUANT8_ASYMM = 7, - /** - * A tensor of 8 bit signed integers that represent real numbers. - * - * Attached to this tensor are two numbers that can be used to convert - * the 8 bit integer to the real value. - * - * channel_dim: a 32 bit unsigned integer indicating channel dimension. - * scales: an array of positive 32 bit floating point values. - * The size of the scales array must be equal to shape[channel_dim]. - * - * The formula is: - * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] - * where C is an index in the Channel dimension. - */ - VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM = 8, - /** - * A tensor of 32 bit signed integers that represent real numbers. - * - * Attached to this tensor are two numbers that can be used to convert - * the 8 bit integer to the real value. - * - * channel_dim: a 32 bit unsigned integer indicating channel dimension. - * scales: an array of positive 32 bit floating point values. - * The size of the scales array must be equal to shape[channel_dim]. - * - * The formula is: - * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] - * where C is an index in the Channel dimension. 
- */ - VSI_NN_TENSOR_QUANT32_PERCHANNEL_SYMM = 9, -} VSI_NN_tensor_quant_type_e; - -/** Parameters for VSI_NN_TENSOR_QUANT8_ASYMM */ -typedef struct -{ - float scale; - int32_t zero_point; -} VSI_NN_quant_param_asymm; - -/** Parameters for VSI_NN_TENSOR_QUANT8_SYMM */ -typedef struct -{ - float scale; -} VSI_NN_quant_param_symm; - -/** Parameters for VSI_NN_TENSOR_QUANT8_DFP */ -typedef struct -{ - int32_t fraction_length; -} VSI_NN_quant_param_dfp; - -/** Parameters for VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM */ -typedef struct -{ - /** The index of the channel dimension. */ - int32_t channel_dim; - - /** - * The array of scaling values for each channel. - * Each value must be greater than zero. - */ - const float* scales; - - /** - * The size of the scale array. - * Should be equal to shape[channel_dim] of the tensor. - * */ - int32_t scale_count; -} VSI_NN_quant_param_perchannel_symm; - -/** Parameters for quantization */ -typedef struct -{ - /** Tensor quantize type */ - VSI_NN_tensor_quant_type_e type; - union - { - /** Dynamic fixed point quantization */ - VSI_NN_quant_param_dfp dfp; - /** Asymmetric affine quantization */ - VSI_NN_quant_param_asymm asymm; - /** Symmetric affine quantization */ - VSI_NN_quant_param_symm symm; - /** Perchannel symmetric affine quantization */ - VSI_NN_quant_param_perchannel_symm perchannel_symm; - } param; -} VSI_NN_tensor_quant_param; - -/** - * NN Runtime context - */ -typedef struct _vsi_nn_context_t VSI_NN_context; - -/** - * VSI_NN_graph is an opaque type that contains a description of the network operations. - * - * Create graph by calling VSI_NN_graph_create. - * A graph is completed by calling VSI_NN_graph_verify. - * A graph is destroyed by calling VSI_NN_graph_release. - * - */ -typedef struct _vsi_nn_graph VSI_NN_graph; - -/** - * VSI_NN_tensor is an opaque type that can be used to describe a tensor. - * - * Create tensor by calling VSI_NN_tensor_create. - * - */ -typedef struct _vsi_nn_tensor VSI_NN_tensor; - -/** - * Create context - * - * @return Context handle on success or NULL otherwise. - */ -EXPORT VSI_NN_context* VSI_NN_context_create(); - -/** - * Release context - * - * @param[in] ctx_ptr The pointer to context to release, and reset point to null. - */ -EXPORT void VSI_NN_context_release - ( - _IN VSI_NN_context** ctx_ptr - ); - -/** - * Create graph - * Create a net graph. - * - * @param[in] ctx The context used to create graph. - * @return The graph on success, or NULL otherwise. - */ -EXPORT VSI_NN_graph* VSI_NN_graph_create - ( - VSI_NN_context* ctx - ); - -/** - * Release graph - * Release a graph and free its resource. - * - * @param[in] graph_ptr The graph to be release. - */ -EXPORT void VSI_NN_graph_release - ( - _IN VSI_NN_graph** graph_ptr - ); - -/** - * Identify graph inputs and outputs - * Identify the input and output tensors of a graph. User should call this to - * specific the inputs and outputs, they are used to exchange data between application - * level and VSI_NN level. - * - * @param[in] graph The graph to be identify. - * @param[in] input_tensors Input tensors. - * @param[in] input_tensors_num Number of input tensors. - * @param[in] output_tensors Output tensors. - * @param[in] output_tensors_num Number of output tensors. 
- * @return VSI_NN_ERROR_OK on success - */ -EXPORT VSI_NN_error_e VSI_NN_graph_identify_input_output - ( - _IN VSI_NN_graph* graph, - _IN const VSI_NN_tensor** input_tensors, - _IN const int32_t input_tensors_num, - _IN const VSI_NN_tensor** output_tensors, - _IN const int32_t output_tensors_num - ); - -/** - * To freeze a graph with verifying and compiling. - * - * This function may take a long time to compile the graph, and it must only be called - * once for a given graph. - * - * A frozen graph cannot be modified. - * - * @param[in] graph The graph to be finished. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_graph_verify - ( - _IN VSI_NN_graph* graph - ); - -/** - * Compute a frozen graph. - * - * @param[in] graph The graph to be executed. - * - * @return VSI_NN_ERROR_OK on success. VSI_NN_ERROR_UNCOMPLETE_GRAPH if - * the graph is not finished. - */ -EXPORT VSI_NN_error_e VSI_NN_graph_compute - ( - _IN const VSI_NN_graph* graph - ); - -//EXPORT VSI_NN_error_e VSI_NN_GRPAH_profile(_IN const VSI_NN_graph* graph); - -/** - * Add a tensor to a graph. - * - * @param[in] graph The graph to be added. - * @param[in] dtype The data type. - * @param[in] shape The shape for the tensor. - * @param[in] ndim The rank for the tensor. - * @param[in] memory The memory address to the data, the memory address - * must be 64-byte align. If it's set to null, vsi_nn can - * optimize the memory allocation and this is default behavior. - * @param[in] memory_size The size of memory. - * @param[in] quant_param The quantization parameters for the tensor, set - * null if it's not quantized tensor. - * - * @return Tensor handle on success, or NULL if get failure. - */ -EXPORT VSI_NN_tensor* VSI_NN_tensor_create - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor_type_e dtype, - _IN const int32_t* shape, - _IN int32_t ndim, - _IN const VSI_NN_tensor_quant_param* quant_param, - _IN void* memory, - _IN size_t memory_size, - _IN int32_t is_constant - ); - -/** - * Add a virtual tensor to a graph. - * - * @param[in] graph The graph to be added. - * @param[in] dtype The data type. - * - * @return Tensor handle on success, or NULL if get failure. - */ -EXPORT VSI_NN_tensor* VSI_NN_tensor_create_virtual - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor_type_e dtype, - _IN const VSI_NN_tensor_quant_param* quant_param - ); - -/** - * Get element size of a tensor. - * - * @param[in] tensor Tensor to query element size. - * - * @return Element size of the tensor. - */ -EXPORT int32_t VSI_NN_tensor_get_size - ( - _IN const VSI_NN_tensor* tensor - ); - -/** - * Get bytes of a tensor. - * - * @param[in] tensor Tensor to query element size. - * - * @return Bytes of the tensor. - */ -EXPORT int32_t VSI_NN_tensor_get_bytes - ( - _IN const VSI_NN_tensor* tensor - ); - -/** - * Read tensor data. - * - * @param[in] tensor Tensor to read. - * @param[in] memory Memory to fill the data. - * @param[in] memory_size Element size of the read data, - * must be equal to tensor size. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_read - ( - _IN VSI_NN_tensor* tensor, - _IN void* memory, - _IN size_t memory_size - ); - -/** - * Write data to tensor. - * - * @param[in] tensor Tensor to write. - * @param[in] memory Memory with the data. - * @param[in] memory_size Element size of the write data, - * must be equal to tensor size. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_tensor_write - ( - _IN VSI_NN_tensor* tensor, - _IN void* memory, - _IN size_t memory_size - ); - -/** - * Swap tensors' memories. - * - * @param[in] tensor1 Tensor to swap the memory. - * @param[in] tensor2 Tensor to swap the memory. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_swap - ( - _IN VSI_NN_tensor* tensor1, - _IN VSI_NN_tensor* tensor2 - ); - -/** - * Swap tensor memories. - * User can use this api to get tensor's original memory. - * - * @param[in] tensor Tensor to swap the memory. - * @param[in] new_memory The new memory for the tensor, - * if NULL, there is no memory swapped. - * @param[in] old_memory Pointer for the tensor's original memory. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_swap_memory - ( - _IN VSI_NN_tensor* tensor, - _IN _OPTIONAL void* new_memory, - _INOUT void** old_memory - ); - -/** - * Flush tensor memory - * Once a tensor's memory is dirty, user should call this api to sync NPU memory. - * - * @param[in] tensor Tensor to flush memory - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_flush_memory - ( - _IN const VSI_NN_tensor* tensor - ); - -/** Convolutional */ -/** - * Convolution 1D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 3D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] stride Convolution stride. - * @param[in] dilation Convolution dilation rate. - * @param[in] pad_front Padding front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_end Padding end value. - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_conv_1d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t stride, - _IN int32_t dilation, - _IN int32_t pad_front, _IN int32_t pad_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -/** - * Convolution 2D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 4D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] stride_h Convolution stride height. - * @param[in] stride_w Convolution stride width. - * @param[in] dilation_h Convolution height dilation rate. - * @param[in] dilation_w Convolution width dilation rate. - * @param[in] pad_h_front Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_h_end Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_front Padding width front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_end Padding widht front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -/** - * Depthwise Convolution 2D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 4D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] multiplier Depthwise convolution multiplier. - * @param[in] stride_h Convolution stride height. - * @param[in] stride_w Convolution stride width. - * @param[in] dilation_h Convolution height dilation rate. - * @param[in] dilation_w Convolution width dilation rate. - * @param[in] pad_h_front Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_h_end Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_front Padding width front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_end Padding widht front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_depthwise_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t multiplier, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -/** - * Grouped Convolution 2D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 4D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] group_number Group number for the convolution. - * @param[in] stride_h Convolution stride height. - * @param[in] stride_w Convolution stride width. - * @param[in] dilation_h Convolution height dilation rate. - * @param[in] dilation_w Convolution width dilation rate. - * @param[in] pad_h_front Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_h_end Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_front Padding width front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_end Padding widht front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_grouped_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t group_number, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_transposed_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN int32_t output_pad_h, _IN int32_t output_pad_w - ); - -/** Pooling */ -EXPORT VSI_NN_error_e VSI_NN_node_average_pool_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t ksize_h, _IN int32_t ksize_w, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding, - _IN VSI_NN_rounding_e size_rounding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_max_pool_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t ksize_h, _IN int32_t ksize_w, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding, - _IN VSI_NN_rounding_e size_rounding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_l2_pool_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t ksize_h, _IN int32_t ksize_w, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding, - _IN VSI_NN_rounding_e size_rounding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_unpool_2d(); - -/** Normalization */ -EXPORT VSI_NN_error_e VSI_NN_node_batch_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* mean, - _IN VSI_NN_tensor* variance, - _IN VSI_NN_tensor* offset, - _IN VSI_NN_tensor* scale, - _IN VSI_NN_tensor* output, - _IN float variance_epsilon - ); - -/** - * L2 Normalization node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axis Normalize axis. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_l2_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_local_response_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t depth_radius, - _IN float bias, - _IN float alpha, - _IN float beta, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_instance_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* offset, - _IN VSI_NN_tensor* scale, - _IN VSI_NN_tensor* output, - _IN float variance_epsilon - ); - -/** Math */ -/** - * Add node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_add - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Multiply node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_mul - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Divide node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_div - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Subtract node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_sub - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Floor node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_floor - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Square node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_square - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Sqrt node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_sqrt - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Rsqrt node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_rsqrt - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Matmul node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * @param[in] transpose_input1 Whether to do transpose on input1. - * @param[in] transpose_input2 Whether to do transpose on input2. - * @param[in] transpose_output Whether to do transpose on output. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_matmul - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output, - _IN int transpose_input1, - _IN int transpose_input2, - _IN int transpose_output - ); - -/** - * Abs node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_abs - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Pow node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_pow - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Maximum node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_maximum - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Minimum node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_minimum - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Exp node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_exp - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Reverse node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reverse. - * @param[in] axes_size Number of axis to reverse. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reverse - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size - ); - -/** - * Transpose node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] perm Transpose order. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_transpose - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* perm - ); - -EXPORT VSI_NN_error_e VSI_NN_node_gather - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* indices, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -/** - * Neg node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_neg - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Reduce max node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_max - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Reduce min node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_min - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Reduce sum node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_sum - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Reduce mean node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_mean - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Sin node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_sin - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_tile - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* multiples, - _IN int32_t multiples_size - ); - -EXPORT VSI_NN_error_e VSI_NN_node_topk - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_indices, - _IN int32_t k - ); - -/** Logical */ -/** - * Equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Greater node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_greater - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Greater equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_greater_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Less node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_less - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Less equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_less_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Logical and node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_logical_and - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Logical or node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_logical_or - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Logical not node. - * - * @param[in] graph Graph to create the node. 
- * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_logical_not - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Not equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_not_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Select node. - * If conditon is true, then output input1 tensor, - * else output input2 tensor. - * - * @param[in] graph Graph to create the node. - * @param[in] condition Conditon tensor.. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_select - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* condition, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** Activation */ -/** - * relu node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_relu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * ReLU1 node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_relu1 - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * ReLU6 node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_relu6 - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_tanh - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN float scale_a, - _IN float scale_b - ); - -/** - * Sigmoid node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_sigmoid - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Hard sigmoid node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_hard_sigmoid - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Mish node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_mish - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_leaky_relu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN float ratio - ); - -EXPORT VSI_NN_error_e VSI_NN_node_prelu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* alpha, - _IN VSI_NN_tensor* output - ); - -/** - * Soft relu node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_soft_relu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Elu node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_elu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** Misc */ -EXPORT VSI_NN_error_e VSI_NN_node_pad - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN VSI_NN_padding_mode_e mode, - _IN const int32_t* pad_front, - _IN const int32_t* pad_end, - _IN int32_t pad_value - ); - -EXPORT VSI_NN_error_e VSI_NN_node_fully_connected - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_concate - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* const inputs[], - _IN int32_t input_num, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_split - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* const outputs[], - _IN int32_t output_num, - _IN const int32_t* slices, - _IN int32_t slices_size, - _IN int32_t axis - ); - -/** - * Cast node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_cast - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Quantize node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_quantize - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Dequantize node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_dequantize - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_space_to_batch - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* pad_front, - _IN const int32_t* pad_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_batch_to_space - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* crop_front, - _IN const int32_t* crop_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_space_to_depth - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* pad_front, - _IN const int32_t* pad_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_depth_to_space - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* crop_front, - _IN const int32_t* crop_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_channel_shuffle - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t group_number, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_expand_dims - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_hashtable_lookup - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* lookups, - _IN VSI_NN_tensor* keys, - _IN VSI_NN_tensor* values, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_hits - ); - -EXPORT VSI_NN_error_e VSI_NN_node_embedding_lookup - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* lookups, - _IN VSI_NN_tensor* values, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_lsh_projection - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* hash_func, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* weight, - _IN VSI_NN_tensor* output, - _IN VSI_NN_lsh_projection_type_e type - ); - -EXPORT VSI_NN_error_e VSI_NN_node_slice - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* begin, - _IN const int32_t* size - ); - -EXPORT VSI_NN_error_e VSI_NN_node_strided_slice - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* begin, - _IN const int32_t* end, - _IN const int32_t* strides, - _IN int32_t begin_mask, - _IN int32_t end_mask, - _IN int32_t shrink_axis_mask - ); - -EXPORT VSI_NN_error_e VSI_NN_node_argmax - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_argmin - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -/** Detection */ -EXPORT VSI_NN_error_e VSI_NN_node_roi_pool - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* feature_map, - _IN VSI_NN_tensor* loc, - _IN VSI_NN_tensor* batch_index, - _IN VSI_NN_tensor* output, - _IN int32_t output_h, - _IN int32_t output_w, - _IN float ratio_h, - _IN float ratio_w - ); - -EXPORT VSI_NN_error_e VSI_NN_node_roi_align - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* feature_map, - _IN VSI_NN_tensor* loc, - _IN VSI_NN_tensor* batch_index, - _IN VSI_NN_tensor* 
output, - _IN int32_t output_h, - _IN int32_t output_w, - _IN float ratio_h, - _IN float ratio_w, - _IN int32_t sample_num_h, - _IN int32_t sample_num_w - ); - -/** Image transform */ -EXPORT VSI_NN_error_e VSI_NN_node_resize_bilinear - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t scale_h, - _IN int32_t scale_w - ); - -EXPORT VSI_NN_error_e VSI_NN_node_resize_nearest - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t scale_h, - _IN int32_t scale_w - ); - -/** RNN */ -EXPORT VSI_NN_error_e VSI_NN_node_svdf - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* weights_feature, - _IN VSI_NN_tensor* weights_time, - _IN VSI_NN_tensor* bias, - _IN VSI_NN_tensor* input_state, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_state, - _IN int32_t rank - ); - -//EXPORT VSI_NN_error_e VSI_NN_node_rnn(); - -EXPORT VSI_NN_error_e VSI_NN_node_rnn_unit - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* input_state, - _IN VSI_NN_tensor* weight, _IN VSI_NN_tensor* recrrent_weight, - _IN VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_state, - _IN VSI_NN_activation_e activation - ); - -EXPORT VSI_NN_error_e VSI_NN_node_lstm_unit - ( - _IN VSI_NN_graph* graph - ); - -__END_DECLS -#endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index b426e4b..d10a29b 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t { char target_name[VSI_NN_MAX_TARGET_NAME]; vsi_nn_hw_evis_t evis; -#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT uint32_t subGroupSize; -#endif uint32_t use_40bits_va; uint32_t support_stream_processor; uint32_t sp_exec_count; uint32_t sp_vector_depth; uint32_t sp_per_core_vector_depth; + uint32_t support_ffd; } vsi_nn_hw_config_t; typedef struct _vsi_nn_runtime_option_t @@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_save_file_type; int32_t enable_use_image_process; int32_t enable_use_from_handle; + vsi_nn_hw_config_t config; } vsi_nn_runtime_option_t; /** @@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t vsi_nn_runtime_option_t options; } VSI_PUBLIC_TYPE *vsi_nn_context_t; +/** + * Query and set options->config hw params. + */ +OVXLIB_API vsi_status query_hardware_caps_runtime + ( + vsi_nn_context_t ctx, + vsi_nn_runtime_option_t *options + ); + /** * Create context * Create ovxlib NN runtime context. @@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions ( vsi_nn_runtime_option_t *options ); +OVXLIB_API vsi_status vsi_nn_initOptions_runtime + ( + vsi_nn_runtime_option_t *options, + vsi_nn_context_t ctx + ); /** * Release context * Release ovxlib NN runtime resource and reset context handle to NULL. 
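For context, a minimal usage sketch of the runtime hardware-capability query declared in the vsi_nn_context.h hunk above; it is not part of this patch. It assumes the application includes the ovxlib public umbrella header and owns the context via vsi_nn_CreateContext(); whether query_hardware_caps_runtime is meant to be called directly by applications, rather than only internally, is also an assumption.

#include <stdio.h>
#include "vsi_nn_pub.h"   /* assumed umbrella header for ovxlib */

int main(void)
{
    vsi_nn_context_t ctx = vsi_nn_CreateContext();
    if (NULL == ctx)
    {
        return -1;
    }

    /* Fill ctx->options.config with the capabilities reported by the driver. */
    if (VSI_SUCCESS == query_hardware_caps_runtime(ctx, &ctx->options))
    {
        printf("subGroupSize=%u, stream_processor=%u, ffd=%u\n",
               ctx->options.config.subGroupSize,
               ctx->options.config.support_stream_processor,
               ctx->options.config.support_ffd);
    }

    vsi_nn_ReleaseContext(&ctx);
    return 0;
}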
diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index b70b1dc..66361bd 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -57,5 +57,8 @@ #define VSI_PER_GROUP_QUANTIZATION_SUPPORT #endif #define VSI_GRAPH_RUNTIME_ENV_SUPPORT +#if defined(VX_TENSOR_SPARSITY_SUPPORT) +#define VSI_TENSOR_SPARSITY_SUPPORT +#endif #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index dc82aeb..ddb21ef 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -216,6 +216,7 @@ #include "ops/vsi_nn_op_grouped_conv3d.h" #include "ops/vsi_nn_op_col2im.h" #include "ops/vsi_nn_op_l1_layer_norm.h" +#include "ops/vsi_nn_op_rope.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param vsi_nn_grouped_conv3d_param grouped_conv3d; vsi_nn_col2im_param col2im; vsi_nn_l1_layer_norm_param l1_layer_norm; + vsi_nn_rope_param rope; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index 90dcb22..2efb763 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -86,8 +86,10 @@ typedef enum VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6, /** perchannel float8 */ VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7, - /** GPQT */ + /** pergroup symmetric */ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8, + /** pergroup asymmetric */ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 4c88f95..9a0acca 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar int8_t is_scalar ); +/** + * Get Tensor is_scalar + * Get the is_sparsity of the tensor + * + * @param[in] tensor Tensor. + * + * @return is_sparsity flag of the tensor. + */ +OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity +( + vsi_nn_tensor_t* tensor +); + +/** + * Set Weight Tensor whether is sparsity + * Set the is_sparsity for the tensor + * + * @param[in] tensor Tensor. + * @param[in] new is_sparsity value of the tensor. + * + * @return VSI_SUCCESS on success, or error core otherwise. 
+**/ + +OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity( + vsi_nn_tensor_t* tensor, + int32_t is_sparsity +); + OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 37368a4..30d0adb 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 2 -#define VSI_NN_VERSION_PATCH 14 +#define VSI_NN_VERSION_PATCH 22 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c new file mode 100644 index 0000000..67a0833 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c @@ -0,0 +1,475 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + +#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox" + +// Add kernel hashtable here +#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \ + _CUSTOM_LETTERBOX_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _custom_letterbox_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8 ), + PACK_KERNEL_MAP( U8, I8 ), + PACK_KERNEL_MAP( U8, F16 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _custom_letterbox_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CUSTOM_LETTERBOX_PARAM_NUM _cnt_of_array( _custom_letterbox_kernel_param_def ) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + VSI_UNREFERENCED(param_size); + int32_t top = 0; + int32_t bottom = 0; + int32_t left = 0; + int32_t right = 0; + float scale_w = 0; + float scale_h = 0; + int32_t resize_w = 0; + int32_t resize_h = 0; + int32_t resize_max_w = 0; + int32_t resize_max_h = 0; + float output_scale = 1.0f; + float output_zp = 0; + float out_scale_r = 0; + float out_zp_r = 0; + float out_scale_g = 0; + float out_zp_g = 0; + float out_scale_b = 0; + float out_zp_b = 0; + float pad_v_r = 0; + float pad_v_g = 0; + float pad_v_b = 0; + int32_t in_width = 0; + int32_t in_height = 0; + int32_t out_width = 0; + int32_t out_height = 0; + float mean_r = 0; + float mean_g = 0; + float mean_b = 0; + float scale_r = 0; + float scale_g = 0; + float scale_b = 0; + vx_int32 pad_value_r = 0; + vx_int32 pad_value_g = 0; + vx_int32 pad_value_b = 0; + vx_int32 r_order = 0; + vx_int32 b_order = 0; + vx_int32 reverse_channel = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top); + status |= 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel); + CHECK_STATUS_FAIL_GOTO(status, final ); + + in_width = (int32_t)attr[0]->shape->data[0] / 3; + in_height = (int32_t)attr[0]->shape->data[1]; + out_width = (int32_t)attr[1]->shape->data[0]; + out_height = (int32_t)attr[1]->shape->data[1] / 3; + + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)(attr[1]->zero_point); + + resize_w = out_width - left - right; + resize_h = out_height - top - bottom; + resize_max_w = out_width - right; + resize_max_h = out_height - bottom; + scale_w = (float)in_width / resize_w; + scale_h = (float)in_height / resize_h; + out_scale_r = scale_r / output_scale; + out_zp_r = output_zp - out_scale_r * mean_r; + out_scale_g = scale_g / output_scale; + out_zp_g = output_zp - out_scale_g * mean_g; + out_scale_b = scale_b / output_scale; + out_zp_b = output_zp - out_scale_b * mean_b; + pad_v_r = pad_value_r * out_scale_r + out_zp_r; + pad_v_g = pad_value_g * out_scale_g + out_zp_g; + pad_v_b = pad_value_b * out_scale_b + out_zp_b; + + if (reverse_channel) + { + r_order = out_height * 2; + b_order = 0; + } + else + { + r_order = 0; + b_order = out_height * 2; + } + + { + gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{ + 0x00090909, // TCfg + 0x00000000, // ASelt + 0x00140003, 0x00000025, // ABin + 0x000a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniLeftToFloat32_4x4 = {{ + 0x00010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00000002, // ABin + 0x00020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtactHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtract8Data_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 
0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, "top", &top ); + status |= vsi_nn_kernel_gpu_add_param( node, "left", &left ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b ); + status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r ); + status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g ); + status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b ); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w ); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h ); + status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w ); + status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height ); + status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order ); + status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = out_width; + gpu_param.global_size[1] = out_height; + + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_affine_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map; + size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map ); + vx_param_description_t * param_def = _custom_letterbox_kernel_param_def; + size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def ); + vx_kernel_initialize_f initializer = _custom_letterbox_initializer; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + 
kernel->info.parameters = param_def; + kernel->info.numParams = (vx_uint32)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + + int32_t top = vsi_nn_kernel_param_get_int32( params, "top"); + int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom"); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left"); + int32_t right = vsi_nn_kernel_param_get_int32( params, "right"); + float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r"); + float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g"); + float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b"); + float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r"); + float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g"); + float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b"); + int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r"); + int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g"); + int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b"); + int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel"); + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + + uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + shapes[0][0] = inputs[0]->attr.size[1] * 3; + shapes[0][1] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1] * 3; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], 2 ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[1], 2 ); + + if (reshape_tensors[0] == NULL || + reshape_tensors[1] == NULL) + { + goto final; + } + + if (reverse_channel) + { + float mean_temp = mean_r; + float scale_temp = scale_r; + int32_t pad_value_temp = pad_value_r; + mean_r = mean_b; + mean_b = mean_temp; + scale_r = scale_b; + scale_b = scale_temp; + pad_value_r = pad_value_b; + pad_value_b = pad_value_temp; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + + vsi_nn_kernel_node_pack_io( node_params, param_num, + reshape_tensors, 1, &reshape_tensors[1], 1 ); + + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r ); + node_params[index++] = vsi_nn_kernel_scalar_create( 
graph, F32, &mean_g ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, param_num ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + + CHECK_STATUS(status); + } + } + +final: + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_letterbox, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c index 6dc60ce..7889891 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #define _CPU_ARG_NUM (1) #define _CPU_INPUT_NUM (1) @@ -42,6 +43,7 @@ #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC") +#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8") #define SCALAR_INPUT_AXIS (2) @@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) { vsi_status status = VSI_FAILURE; int sf_size = 0; - vsi_nn_kernel_tensor_attr_t* attr = NULL; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + float srcZP = 0.0f; + float srcScale = 1.0f; + float dstZP = 0.0f; + float dstScale = 1.0f; // Alignment with a power of two value. gpu_param_t gpu_param = { 2, // workdim @@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) VSI_UNREFERENCED(param_size); - attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); - if (!attr) + attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); + attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]); + if ((!attr[0]) || (!attr[1])) { VSILOGE("Query failure! 
at line"); return status; } - sf_size = (int)attr->shape->data[0]; + sf_size = (int)attr[0]->shape->data[0]; + srcScale = attr[0]->scale; + srcZP = (float)attr[0]->zero_point; + dstScale = 1.0f / attr[1]->scale; + dstZP = (float)attr[1]->zero_point; gpu_param.global_offset[0] = 0; gpu_param.global_offset[1] = 0; @@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) gpu_param.local_size[0] = 1; gpu_param.local_size[1] = 1; gpu_param.global_size[0] = - gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], + gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], gpu_param.local_size[0]); gpu_param.global_size[1] = gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], @@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtract8Bin_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; status = vsi_nn_kernel_gpu_add_param( node, "Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 ); - vsi_nn_kernel_gpu_add_param(node, + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Bin_2x8", &uniExtract8Bin_2x8 ); + status |= vsi_nn_kernel_gpu_add_param(node, "sf_size", &sf_size); + status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale); + status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP); + status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP); } - status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); if(status != VSI_SUCCESS) { VSILOGE("Initializer failure!"); } - if (attr) vsi_nn_kernel_tensor_attr_release( &attr ); + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } return status; } -static const vx_kernel_description_t _kernel_info = +static const vx_kernel_description_t _kernel_info1 = { KERNEL_ID_PLACEHOLDER, _KERNEL_NAME, @@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info = vsi_nn_KernelDeinitializer }; +static const vx_kernel_description_t _kernel_info2 = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME_U8, + NULL, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + _softmax_initializer, + vsi_nn_KernelDeinitializer +}; + static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, @@ -146,9 +196,20 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + + in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type); + + if (in_dtype == U8 && out_dtype == U8) + { + memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) ); + } + else + { + memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) ); + } 
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", @@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + vsi_nn_tensor_t* reshape_tensors[2] = {NULL}; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t rank_in = 0; + int32_t new_axis = 0; + uint32_t i = 0; + vsi_bool ret = vx_false_e; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); axis = vsi_nn_kernel_param_get_int32(params, "axis"); + ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num, + axis, + shapes[0], + &rank_in, + &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in); + reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in); + } + else + { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num) || + new_axis > 2) + { + return NULL; + } + status = _query_kernel( inputs, outputs, kernel ); if( VSI_SUCCESS == status) { @@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM ); backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); + graph, I32, &new_axis ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); @@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup status = VSI_FAILURE; } } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c new file mode 100644 index 0000000..6567838 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c @@ -0,0 +1,227 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _custom_letterbox_local_data_t { + int32_t placeholder; +} custom_letterbox_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +int32_t my_round(float in) +{ + if (in >= 0) + { + return (int)(in + 0.5f); + } + else + { + return (int)(in - 0.5f); + } +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_letterbox_param * p; + p = &(self->nn_param.custom_letterbox); + int32_t shape_w = (int32_t)inputs[0]->attr.size[1]; + int32_t shape_h = (int32_t)inputs[0]->attr.size[2]; + int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0]; + int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1]; + vx_bool auto_bool = p->auto_bool; + vx_bool scaleFill = p->scaleFill; + vx_bool scaleup = p->scaleup; + int32_t stride = p->stride; + vx_bool center = p->center; + + float r = 1.0f; + int32_t new_unpad_w = 0; + int32_t new_unpad_h = 0; + int32_t dw = 0; + int32_t dh = 0; + int32_t top = 0; + int32_t bottom = 0; + int32_t left = 0; + int32_t right = 0; + + r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h); + if (!scaleup) + { + r = (float)fmin(r, 1.0f); + } + + new_unpad_w = my_round(r * shape_w); + new_unpad_h = my_round(r * shape_h); + dw = new_shape_w - new_unpad_w; + dh = new_shape_h - new_unpad_h; + if (auto_bool) + { + dw = dw % stride; + dh = dh % stride; + } + else if (scaleFill) + { + dw = 0; + dh = 0; + new_unpad_w = new_shape_w; + new_unpad_h = new_shape_h; + } + if (center) + { + top = my_round(dh / 2.0f - 0.1f); + bottom = my_round(dh / 2.0f + 0.1f); + left = my_round(dw / 2.0f - 0.1f); + right = my_round(dw / 2.0f + 0.1f); + } + else + { + top = 0; + bottom = my_round(dh + 0.1f); + left = 0; + right = my_round(dw + 0.1f); + } + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "top", top); + vsi_nn_kernel_param_add_int32( param, "bottom", bottom); + vsi_nn_kernel_param_add_int32( param, "left", left); + vsi_nn_kernel_param_add_int32( param, "right", right); + vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r); + vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g); + vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b); + vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r); + vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g); + vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b); + vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r); + vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g); + vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b); + vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_letterbox", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + 
vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_SYM) + END_IO_TYPE_DECL(LETTERBOX) + if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w; + outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h; + outputs[0]->attr.size[2] = 3; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_LETTERBOX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 50c435b..bcde042 100644 --- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -85,18 +85,24 @@ static const struct { HASH_CUMSUM_KERNELS(0, U8, U8) HASH_CUMSUM_KERNELS(0, F32, F32) HASH_CUMSUM_KERNELS(0, F32, U8) + HASH_CUMSUM_KERNELS(0, I32, I32) HASH_CUMSUM_KERNELS(1, U8, U8) HASH_CUMSUM_KERNELS(1, F32, F32) HASH_CUMSUM_KERNELS(1, F32, U8) + HASH_CUMSUM_KERNELS(1, I32, I32) HASH_CUMSUM_KERNELS(2, U8, U8) HASH_CUMSUM_KERNELS(2, F32, F32) HASH_CUMSUM_KERNELS(2, F32, U8) + HASH_CUMSUM_KERNELS(2, I32, I32) + HASH_CUMSUM_KERNELS_2D(0, U8, U8) HASH_CUMSUM_KERNELS_2D(0, F32, F32) HASH_CUMSUM_KERNELS_2D(0, F32, U8) + HASH_CUMSUM_KERNELS_2D(0, I32, I32) HASH_CUMSUM_KERNELS_2D(1, U8, U8) HASH_CUMSUM_KERNELS_2D(1, F32, F32) HASH_CUMSUM_KERNELS_2D(1, F32, U8) + HASH_CUMSUM_KERNELS_2D(1, I32, I32) HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3) HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3) diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index f139ccb..44c14d6 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -26,6 +26,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT shader_cnt_support = - (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE; + (((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 && + ((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? 
TRUE : FALSE; #endif if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support) { diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index eb0e556..60ab16b 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] = PACK_ONE_HOT_KERNEL_MAP( F32, F32 ), PACK_ONE_HOT_KERNEL_MAP( I32, I32 ), PACK_ONE_HOT_KERNEL_MAP( I32, F32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ), PACK_ONE_HOT_KERNEL_MAP( I32, U8 ), PACK_ONE_HOT_KERNEL_MAP( U8, U8 ), }; diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c index 87c8593..1f0d4a9 100644 --- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -79,7 +79,7 @@ static const struct { const char* source_name; } kernel_map[] = { - PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1) diff --git a/src/tim/vx/internal/src/kernel/cl/rope_cl.c b/src/tim/vx/internal/src/kernel/cl/rope_cl.c new file mode 100644 index 0000000..90c60c3 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/rope_cl.c @@ -0,0 +1,329 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_ROPE, +} _internal_kernel_e; + +#define _ROPE_KERNEL_SOURCE "rope" +#define _ROPE_KERNEL_NAME CVIVANTE_NAMESPACE("cl.rope") + +// Add kernel hashtable here +#define STR(a) #a +#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \ + ((IN0_DTYPE) | (IN0_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25)) +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \ + CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \ + "rope_0" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _rope_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, F32, 0 ), + PACK_KERNEL_MAP( F32, F32, F32, 1 ), + PACK_KERNEL_MAP( F32, F32, F32, 2 ), + PACK_KERNEL_MAP( I32, I32, I32, 0 ), + PACK_KERNEL_MAP( I32, I32, I32, 1 ), + PACK_KERNEL_MAP( I32, I32, I32, 2 ), + PACK_KERNEL_MAP( U32, U32, U32, 0 ), + PACK_KERNEL_MAP( U32, U32, U32, 1 ), + PACK_KERNEL_MAP( U32, U32, U32, 2 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _rope_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def ) +#define SCALAR_AXIS (4) +#define SCALAR_IN_ZP (5) +#define SCALAR_COS_ZP (6) +#define SCALAR_SIN_ZP (7) +#define SCALAR_SCALE0 (8) +#define SCALAR_SCALE1 (9) +#define SCALAR_OUT_ZP (10) +#define SCALAR_HALF_HEAD_SIZE (11) +#define SCALAR_STEP (12) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_rope_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL }; + int32_t axis = 0; + vsi_size_array_t* out_shape = NULL; + vsi_size_t shape[3] = { 1 }; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis); + CHECK_STATUS_FAIL_GOTO(status, final); + + out_shape = attr[1]->shape; + 
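+    /* One work item along the rotation axis covers a pair of output elements,
+     * so the global work size on that axis is halved below. */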
shape[0] = out_shape->data[0]; + shape[1] = out_shape->data[1]; + shape[2] = out_shape->data[2]; + shape[axis] = shape[axis] / 2; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = shape[0]; + gpu_param.global_size[1] = shape[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1; + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _rope_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _rope_kernel_map; + size_t kernel_map_size = _cnt_of_array( _rope_kernel_map ); + vx_param_description_t * param_def = _rope_kernel_param_def; + vx_kernel_initialize_f initializer = _rope_initializer; + + uint32_t key = 0; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); + in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \ + ((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24)) + switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32, F32, F32): + case _PACK_SELECT_KEY(F16, F16, F16, F16): + key = ROPE_HASH_KEY(F32, F32, F32, axis); + break; + case _PACK_SELECT_KEY(U8, U8, U8, U8): + case _PACK_SELECT_KEY(U16, U16, U16, U16): + key = ROPE_HASH_KEY(U32, U32, U32, axis); + break; + case _PACK_SELECT_KEY(I8, I8, I8, I8): + case _PACK_SELECT_KEY(I16, I16, I16, I16): + case _PACK_SELECT_KEY(I32, I32, I32, I32): + key = ROPE_HASH_KEY(I32, I32, I32, axis); + break; + default: + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32(params, 
"axis"); + int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved"); + float in_scale = vsi_nn_get_tensor_scale(inputs[0]); + float cos_scale = vsi_nn_get_tensor_scale(inputs[1]); + float sin_scale = vsi_nn_get_tensor_scale(inputs[2]); + float out_scale = vsi_nn_get_tensor_scale(outputs[0]); + float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2); + float scale0 = in_scale * cos_scale / out_scale; + float scale1 = in_scale * sin_scale / out_scale; + int32_t step = interleaved ? 2 : 1; + int32_t i = 0; + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis ); + if (VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis); + node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &in_zp); + node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &cos_zp); + node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &sin_zp); + node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( + graph, F32, &scale0); + node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( + graph, F32, &scale1); + node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp); + node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &half_head_size); + node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create( + graph, I32, &step); + status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM ); + } + } + + for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++) + { + if (node_params[i]) + { + vsi_nn_kernel_scalar_release(&node_params[i]); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( rope, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index 97d0db9..97cbd4a 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup VSI_UNREFERENCED(output_num); #if (VX_ACTIVATION_EXT_SUPPORT) - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index 78b9a9b..3a698fe 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -26,6 +26,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool 
is_odd_even_sort = FALSE; vsi_bool is_bitnoic_segment = FALSE; size_t param_num = _TOPK_PARAM_NUM; - int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); + int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2); vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (block_size >= GPU_TENSOR_MAX_WIDTH) + { + return NULL; + } + shape[0][0] = block_size; shape[0][1] = block_num; shape[1][0] = top_k; diff --git a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c index 75623dd..95435d2 100644 --- a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types return FALSE; } - if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2) + if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2) { return FALSE; } diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 0e4e1fe..ee8c8fe 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup temp_tensor[1] = weights; temp_tensor[2] = biases; - ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver); + ks = get_kernel_size(weights->attr.size[0], dilation, stride, + ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver); status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks); diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index ce13b84..79e5b02 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] = TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 ) TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) }; @@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] = TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 ) }; /* @@ -245,6 
+250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) float sum_x2_tail0 = 1; float sum_x2_tail1 = 1; float work_item_pixels = 1; + vsi_bool is_input_8bits = FALSE; VSI_UNREFERENCED(param_size); @@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); + is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8; if (is2D) { height = 1; } - work_item_pixels = (float)height * 16; + work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height; sum_x_tail = -work_item_pixels * input_zp * input_scale; sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2; @@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) shaderParam.local_size[1] = 1; shaderParam.local_size[2] = 1; - if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + if (is_input_8bits) { shaderParam.global_size[0] = (width + 255) / 256 * 16; } - else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16) { shaderParam.global_size[0] = (width + 127) / 128 * 16; } @@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16) { gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg @@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16) { shaderParam.global_scale[0] = 8; } @@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( U16, U16 ): case _PACK_SELECT_KEY( I16, I16 ): case _PACK_SELECT_KEY( I16, F16 ): case _PACK_SELECT_KEY( F16, F16 ): @@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; attr.size[0] = ((new_shape[0] + 255) / 256) * 4; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16) { attr.size[0] = ((new_shape[0] + 127) / 128) * 4; } diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index 4703424..5d7ae5a 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -124,22 +124,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) {0, 0, 0} }; int8_t in0_fl = 0; - int32_t inputZP0 = 0; - float input_scale0 = 1.0f; - int32_t inputZP1 = 0; - float input_scale1 = 1.0f; + int32_t input0_zp = 0; + float input0_scale = 1.0f; + int32_t input1_zp = 0; + float input1_scale = 1.0f; + float output_zp = 0; int8_t out_fl = 0; - float outputZP = 0; - int32_t shift0 = 0; - vsi_bool is_ge_fl = FALSE; + int32_t shift0 = 0; + vsi_bool is_ge_fl = FALSE; + vsi_bool is_2d_img = FALSE; uint32_t evis_version = 0; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; uint32_t pack_key; - vx_context ctx = vxGetContext((vx_reference)node); + vx_context ctx = vxGetContext((vx_reference)node); vx_hardware_caps_params_t hw_param; 
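+    /* evis_version (derived from hw_param) is folded into the pack key so the
+       EVIS1 and EVIS2 DP-instruction variants can be selected separately. */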
VSI_UNREFERENCED(param_size); @@ -165,34 +166,30 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); out_shape = attr[2]->shape; - inputZP0 = attr[0]->zero_point; - input_scale0 = attr[0]->scale; - inputZP1 = attr[1]->zero_point; - input_scale1 = attr[1]->scale; - outputZP = (float)attr[2]->zero_point; - input_scale0 = input_scale0 / attr[2]->scale; + input0_zp = attr[0]->zero_point; + input0_scale = attr[0]->scale; + input1_zp = attr[1]->zero_point; + input1_scale = attr[1]->scale; + output_zp = (float)attr[2]->zero_point; + input0_scale = input0_scale / attr[2]->scale; - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP && + attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) { in0_fl = (int8_t)attr[0]->dfp.fl; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { out_fl = (int8_t)attr[2]->dfp.fl; + shift0 = in0_fl - out_fl; + is_ge_fl = shift0 >= 0; } - shift0 = in0_fl - out_fl; - is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1); - is_ge_fl = shift0 >= 0; #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \ (IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26)) - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version ); + pack_key = _PACK_SELECT_KEY(attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version); - if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl) + if (attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl) { gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; @@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; } - gpu_param.global_size[0] = gpu_align_p2( (out_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); @@ -215,97 +211,97 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ): - case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ): + case _PACK_SELECT_KEY(I8, I8, 1, 1, 2): + case _PACK_SELECT_KEY(I16, I16, 1, 1, 2): + { + gpu_dp_inst_t uniPreluDFPLo_2x8b = { { + 0x77777777, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0x00000000, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00004000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluDFPHi_2x8b = { { + 0x77777777, // TCfg + 0x44444444, // ASelt + 0xbbaa9988, 0xffeeddcc, // ABin + 0x00000000, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00004000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (attr[0]->dtype == I16) { - gpu_dp_inst_t uniPreluDFPLo_2x8b = {{ - 0x77777777, // TCfg - 0x44444444, // ASelt - 0x33221100, 0x77665544, // ABin - 0x00000000, // BSelt - 0x30201000, 0x70605040, // BBin - 0x00004000, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPreluDFPHi_2x8b = {{ - 0x77777777, // TCfg - 0x44444444, // ASelt - 0xbbaa9988, 0xffeeddcc, // ABin - 0x00000000, // BSelt - 0x30201000, 0x70605040, // BBin - 0x00004000, // AccumType, ConstantType, and PostShift - 0x00000000, 
0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - if ( attr[0]->dtype == I16 ) - { - uniPreluDFPLo_2x8b.data[7] = 0x00003000; - uniPreluDFPHi_2x8b.data[7] = 0x00003000; - } - - gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 ); - gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 ); - - status = vsi_nn_kernel_gpu_add_param( node, - "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b ); - CHECK_STATUS_FAIL_GOTO(status, final ); + uniPreluDFPLo_2x8b.data[7] = 0x00003000; + uniPreluDFPHi_2x8b.data[7] = 0x00003000; } - break; - case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ): - case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ): - { - gpu_dp_inst_t uniPreluInt8_2x8 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0xb3a29180, 0xf7e6d5c4, // ABin - 0x66666666, // BSelt - 0x30201000, 0x70605040, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{ - 0x05050505, // TCfg - 0x00000000, // ASelt - 0x00510040, 0x00730062, // ABin - 0x06060606, // BSelt - 0x00100000, 0x00300020, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{ - 0x05050505, // TCfg - 0x00000000, // ASelt - 0x00510040, 0x00730062, // ABin - 0x06060606, // BSelt - 0x00500040, 0x00700060, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 ); - gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 ); - gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 ); + gpu_dp_inst_update_postshfit(&uniPreluDFPLo_2x8b, shift0); + gpu_dp_inst_update_postshfit(&uniPreluDFPHi_2x8b, shift0); - status = vsi_nn_kernel_gpu_add_param( node, - "uniPreluInt8_2x8", &uniPreluInt8_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - break; - case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ): - case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ): - case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ): - case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ): + status = vsi_nn_kernel_gpu_add_param(node, + "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(I8, I8, 1, 1, 1): + case _PACK_SELECT_KEY(I16, I16, 1, 1, 1): + { + gpu_dp_inst_t uniPreluInt8_2x8 = { { + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xb3a29180, 0xf7e6d5c4, // ABin + 0x66666666, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluInt16_part0_4x4 = { { + 0x05050505, // TCfg + 0x00000000, // ASelt + 
0x00510040, 0x00730062, // ABin + 0x06060606, // BSelt + 0x00100000, 0x00300020, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluInt16_part1_4x4 = { { + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00510040, 0x00730062, // ABin + 0x06060606, // BSelt + 0x00500040, 0x00700060, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit(&uniPreluInt8_2x8, shift0); + gpu_dp_inst_update_postshfit(&uniPreluInt16_part0_4x4, shift0); + gpu_dp_inst_update_postshfit(&uniPreluInt16_part1_4x4, shift0); + + status = vsi_nn_kernel_gpu_add_param(node, + "uniPreluInt8_2x8", &uniPreluInt8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1): + case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2): + case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1): + case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, - "inputZP0", &inputZP0 ); + "input0_zp", &input0_zp); status |= vsi_nn_kernel_gpu_add_param( node, - "input_scale0", &input_scale0 ); + "input0_scale", &input0_scale ); status |= vsi_nn_kernel_gpu_add_param( node, - "inputZP1", &inputZP1 ); + "input1_zp", &input1_zp); status |= vsi_nn_kernel_gpu_add_param( node, - "input_scale1", &input_scale1 ); + "input1_scale", &input1_scale ); status |= vsi_nn_kernel_gpu_add_param( node, - "outputZP", &outputZP ); + "output_zp", &output_zp ); if (attr[2]->dtype == F16) { status |= vsi_nn_kernel_gpu_add_param( node, diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index a63fc3a..63c72b2 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -58,53 +59,92 @@ typedef enum #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1" #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5" #define STR(a) #a // Add kernel hashtable here -#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \ - (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag)) 
+#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22)) -#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \ +#define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \ _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \ +#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \ + _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \ + _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE ) + +#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \ _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \ +#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \ + _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \ + _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE ) + +#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ +#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \ + _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \ + _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE ) + +#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_2x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \ +#define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_4x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \ +#define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_8x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \ +#define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_3x_upsample_half_pixel_centers"), \ 
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } +#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_2x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_4x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_8x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_3x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) } + #define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_8x_upsample_align_corners"), \ "resize_bilinear_align_corners" } @@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP(F16, F16), PACK_KERNEL_MAP_UP(BF16, BF16), PACK_KERNEL_MAP_UP_OPT(U8, U8), + PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8), + PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8), + PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8), + PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8), PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), @@ -672,18 +716,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_size_array_t * out_shape = NULL; - vsi_size_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; uint32_t depth = 0; uint32_t in_width = 0; uint32_t in_height = 0; uint32_t out_width = 0; uint32_t out_height = 0; + vsi_bool is_same_type = FALSE; vsi_bool is_2x_up_kernel = FALSE; vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; vsi_bool is_8x_up_kernel = FALSE; + float scale = 1.f; + int32_t input_zp = 0; + int32_t output_zp = 0; VSI_UNREFERENCED(param_size); @@ -692,17 +741,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); - out_shape = output_attr->shape; - in_shape = input_attr->shape; - input_dtype = input_attr->dtype; + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; in_width = (uint32_t)(in_shape->data[0]); in_height = (uint32_t)(in_shape->data[1]); depth = (uint32_t)(in_shape->data[2]); out_width 
= (uint32_t)(out_shape->data[0]); out_height = (uint32_t)(out_shape->data[1]); + scale = input_attr->scale; + input_zp = input_attr->zero_point; + scale /= output_attr->scale; + output_zp = output_attr->zero_point; + is_same_type = _is_same_quant(input_attr, output_attr); - if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) + if ((U8 == input_dtype) && (output_dtype == U8)) { is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); @@ -728,206 +783,303 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) gpu_param.global_scale[2] = 1; } - if (is_2x_up_kernel) + if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) { - gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = { 0 }; + gpu_dp_inst_t uniU8PostProcess_2x8 = { { + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_3x_up_kernel) - { - gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ - 0x15515515, // TCfg - 0x00000000, // ASelt - 0x21210110, 0x03323202, // ABin - 0x2aa2aa2a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, - 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ - 0x05155155, // TCfg - 0x00000000, // ASelt - 0x54044343, 0x00650554, // ABin - 0x0a2aa2aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ - 0x55551155, // TCfg - 0x50501050, // ASelt - 0x01011010, 0x21212121, // ABin - 0xaaaa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ - 0x11555511, // TCfg - 0x10505010, // ASelt - 0x32320202, 0x03033232, // ABin - 0x22aaaa22, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // 
AccumType, ConstantType, and PostShift - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ - 0x55115555, // TCfg - 0x50105050, // ASelt - 0x43434343, 0x54540404, // ABin - 0xaa22aaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ - 0x00551155, // TCfg - 0x00501050, // ASelt - 0x05055454, 0x00006565, // ABin - 0x00aa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize2xUp_0_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize2xUp_1_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_4x_up_kernel) - { - gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 
0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; + if (!is_same_type) + { + float f2i_radio = 16.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); - status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_8x_up_kernel) - { - gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 
0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize2xUp_0_4x8.data[7] = 0x00000700; + uniResize2xUp_1_4x8.data[7] = 0x00000700; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize3xUp_l00_2x8 = { { + 0x15515515, // TCfg + 0x00000000, // ASelt + 0x21210110, 0x03323202, // ABin + 0x2aa2aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l01_2x8 = { { + 0x05155155, // TCfg + 0x00000000, // ASelt + 0x54044343, 0x00650554, // ABin + 0x0a2aa2aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l10_4x4 = { { + 0x55551155, // TCfg + 0x50501050, // ASelt + 0x01011010, 0x21212121, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l11_4x4 = { { + 0x11555511, // TCfg + 0x10505010, // ASelt + 0x32320202, 0x03033232, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l12_4x4 = { { + 0x55115555, // TCfg + 0x50105050, // ASelt + 0x43434343, 0x54540404, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniResize3xUp_l13_4x4 = { { + 0x00551155, // TCfg + 0x00501050, // ASelt + 0x05055454, 0x00006565, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (!is_same_type) + { + float f2i_radio = 256.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); + + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize3xUp_l00_2x8.data[7] = 0x00000608; + uniResize3xUp_l01_2x8.data[7] = 0x00000608; + uniResize3xUp_l10_4x4.data[7] = 0x00000607; + uniResize3xUp_l11_4x4.data[7] = 0x00000607; + uniResize3xUp_l12_4x4.data[7] = 0x00000607; + uniResize3xUp_l13_4x4.data[7] = 0x00000607; + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); + CHECK_STATUS_FAIL_GOTO(status, final); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize4xUp_l00_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize4xUp_l01_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize4xUp_l10_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize4xUp_l11_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16 }; + + if (!is_same_type) + { + float f2i_radio = 64.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); + + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize4xUp_l00_4x8.data[7] = 
0x00000400; + uniResize4xUp_l01_4x8.data[7] = 0x00000400; + uniResize4xUp_l10_4x8.data[7] = 0x00000400; + uniResize4xUp_l11_4x8.data[7] = 0x00000400; + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final); + } + else if (is_8x_up_kernel) + { + gpu_dp_inst_t uniResize8xUp_l00_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l01_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l10_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l11_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l20_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l21_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l30_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l31_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 
0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16 }; + + if (!is_same_type) + { + float f2i_radio = 256.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); + + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize8xUp_l00_4x8.data[7] = 0x00000700; + uniResize8xUp_l01_4x8.data[7] = 0x00000700; + uniResize8xUp_l10_4x8.data[7] = 0x00000700; + uniResize8xUp_l11_4x8.data[7] = 0x00000700; + uniResize8xUp_l20_4x8.data[7] = 0x00000700; + uniResize8xUp_l21_4x8.data[7] = 0x00000700; + uniResize8xUp_l30_4x8.data[7] = 0x00000700; + uniResize8xUp_l31_4x8.data[7] = 0x00000700; + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final); + } } else { @@ -1193,22 +1345,22 @@ static vsi_status _query_kernel if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) { - if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample) + if ((!align_corners) && (half_pixel_centers) && is_2x_upsample) { scale_flag = UP_2X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; } - else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample) + else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample) { scale_flag = UP_3X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; } - else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample) + else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample) { scale_flag = UP_4X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; } - else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample) + else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample) { scale_flag = UP_8X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; @@ -1232,7 +1384,7 @@ static vsi_status _query_kernel scale_flag = DOWN; } - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type); for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if( kernel_map[i].key == key ) @@ -1244,7 +1396,7 @@ static vsi_status _query_kernel if ((UP_OPT == scale_flag) && (i >= kernel_map_size)) { scale_flag = UP; - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, 
scale_flag, is_same_type); for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if( kernel_map[i].key == key ) @@ -1257,7 +1409,7 @@ static vsi_status _query_kernel if ((UP == scale_flag) && (i >= kernel_map_size)) { scale_flag = DOWN; - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type); for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if( kernel_map[i].key == key ) @@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16 size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype); vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8; - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { return FALSE; } @@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]); - vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); + vsi_bool is_evis2 = \ + (vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2); vsi_bool is_run_opt_kernel = FALSE; vsi_nn_tensor_t* scale = NULL; int32_t pad_left = half_pixel_centers ? 1 : 0; diff --git a/src/tim/vx/internal/src/kernel/evis/rope_evis.c b/src/tim/vx/internal/src/kernel/evis/rope_evis.c new file mode 100644 index 0000000..381abeb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/rope_evis.c @@ -0,0 +1,744 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
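+ * Tensor-layout abbreviations used by the kernel-map entries below: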
+ B---batch + N---num_heads + S---sequence length + H---head size + */ +typedef enum +{ + LAYOUT_NONE, + LAYOUT_BNHS, + LAYOUT_BNH1, + LAYOUT_BSNH, + LAYOUT_BNSH, +} _internal_rope_layout_e; + +// Add kernel hashtable here +#define STR(a) #a +#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \ + ((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28)) +#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \ + "rope_0" } +#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \ + "rope_1" } + +#define PACK_KERNEL_BSNH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \ + "rope_2" } + +#define PACK_KERNEL_BNSH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \ + "rope_3" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + PACK_KERNEL_BSNH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + PACK_KERNEL_BNSH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), + +static const _kernel_map_type _rope_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( BF16, BF16, BF16) + PACK_KERNEL_MAP( F16, F16, F16 ) + PACK_KERNEL_MAP( I16, I16, I16 ) + PACK_KERNEL_MAP( I16, F16, I16 ) + PACK_KERNEL_MAP( I16, I16, I8 ) + PACK_KERNEL_MAP( I16, F16, I8 ) + PACK_KERNEL_MAP( I16, I16, U8 ) + PACK_KERNEL_MAP( I16, F16, U8 ) + PACK_KERNEL_MAP( U16, U16, U16 ) + PACK_KERNEL_MAP( U16, F16, U16 ) + PACK_KERNEL_MAP( I8, I8, I8 ) + PACK_KERNEL_MAP( I8, F16, I8 ) + PACK_KERNEL_MAP( U8, U8, U8 ) + PACK_KERNEL_MAP( U8, F16, U8 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _rope_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def ) +#define SCALAR_AXIS (4) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_rope_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t* out_attr = NULL; + vsi_nn_kernel_tensor_attr_t* in0_attr = NULL; + vsi_nn_kernel_tensor_attr_t* in1_attr = NULL; + vsi_nn_kernel_tensor_attr_t* in2_attr = NULL; + vsi_size_array_t* in_shape = NULL; + vsi_nn_kernel_dtype_e in0_dtype = F16; + vsi_nn_kernel_dtype_e in1_dtype = F16; + vsi_nn_kernel_dtype_e in2_dtype = F16; + 
vsi_nn_kernel_dtype_e out_dtype = F16; + float in0_scale = 1.0f; + float in1_scale = 1.0f; + float in2_scale = 1.0f; + float output_scale = 1.0f; + float output_zp = 0; + int32_t in0_zp = 0; + int32_t cos_zp = 0; + int32_t sin_zp = 0; + int32_t p = 0; + int32_t axis = 0; + int32_t interleaved = 0; + int32_t half_head_size = 1; + vsi_size_t shape[3] = {1}; + uint32_t pack_key = 0; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); + // Add initializer + + in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); + CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final); + in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]); + CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final); + in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final); + out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]); + CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p); + CHECK_STATUS_FAIL_GOTO(status, final); + + axis = p & 0xFFFF; + interleaved = (p >> 16) & 0xFFFF; + + in_shape = in0_attr->shape; + in0_dtype = in0_attr->dtype; + in1_dtype = in1_attr->dtype; + in2_dtype = in2_attr->dtype; + out_dtype = out_attr->dtype; + + in0_scale = in0_attr->scale; + in1_scale = in1_attr->scale; + in2_scale = in2_attr->scale; + in0_zp = -in0_attr->zero_point; + cos_zp = -in1_attr->zero_point; + sin_zp = -in2_attr->zero_point; + output_scale = out_attr->scale; + output_zp = (float)out_attr->zero_point; + + half_head_size = (int32_t)(in_shape->data[axis] / 2); + shape[0] = in_shape->data[0]; + shape[1] = in_shape->data[1]; + shape[2] = in_shape->data[2]; + shape[axis] = half_head_size; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((shape[0] + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = shape[1]; + gpu_param.global_size[2] = shape[2]; + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ + ((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24)) + + pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype); + switch (pack_key) + { + case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = { { + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + if (interleaved && axis == 0) + { + uniExtractOddData_2x8.data[1] = 0x10101010; + uniExtractOddData_2x8.data[2] = 0x03030101; + uniExtractOddData_2x8.data[3] = 0x07070505; + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, + "half_head_size", &half_head_size); + CHECK_STATUS_FAIL_GOTO(status, final); + } + status = vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(I16, I16, I16, I16): + case _PACK_SELECT_KEY(I16, F16, F16, I16): + case _PACK_SELECT_KEY(I16, I16, I16, I8): + case _PACK_SELECT_KEY(I16, F16, F16, I8): + case _PACK_SELECT_KEY(I16, I16, I16, U8): + case _PACK_SELECT_KEY(I16, F16, F16, U8): + case _PACK_SELECT_KEY(F16, F16, F16, F16): + { + float scale0 = in0_scale * in1_scale / output_scale; + float scale1 = in0_scale* in2_scale / output_scale; + gpu_dp_inst_t uniExtractHalf8_2x8 = { { + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = { { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniATimesB_0_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniATimesB_1_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAOddTimesB_0_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x01010101, // BSelt + 
0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAOddTimesB_1_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (interleaved && axis == 0) + { + uniExtractHalf8_2x8.data[1] = 0x10101010; + uniExtractHalf8_2x8.data[2] = 0x02020000; + uniExtractHalf8_2x8.data[3] = 0x06060404; + uniExtractInteger_2x8.data[1] = 0x10101010; + uniExtractInteger_2x8.data[2] = 0x01010000; + uniExtractInteger_2x8.data[3] = 0x03030202; + + status = vsi_nn_kernel_gpu_add_param(node, + "uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, + "uniATimesB_0_4x4", &uniATimesB_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniATimesB_1_4x4", &uniATimesB_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "half_head_size", &half_head_size); + } + status |= vsi_nn_kernel_gpu_add_param(node, + "scale0", &scale0); + status |= vsi_nn_kernel_gpu_add_param(node, + "scale1", &scale1); + status |= vsi_nn_kernel_gpu_add_param(node, + "output_zp", &output_zp); + if (out_dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(I8, I8, I8, I8): + case _PACK_SELECT_KEY(U8, U8, U8, U8): + case _PACK_SELECT_KEY(U16, U16, U16, U16): + case _PACK_SELECT_KEY(I8, F16, F16, I8): + case _PACK_SELECT_KEY(U8, F16, F16, U8): + case _PACK_SELECT_KEY(U16, F16, F16, U16): + { + float scale0 = in0_scale * in1_scale / output_scale; + float scale1 = in0_scale* in2_scale / output_scale; + gpu_dp_inst_t uniExtractInteger_2x8 = { { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAMinusZp_0_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAMinusZp_1_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniAEvenMinusZp_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAOddMinusZp_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (interleaved && axis == 0) + { + uniExtractInteger_2x8.data[1] = 0x10101010; + uniExtractInteger_2x8.data[2] = 0x01010000; + uniExtractInteger_2x8.data[3] = 0x03030202; + + status = vsi_nn_kernel_gpu_add_param(node, + "uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, + "half_head_size", &half_head_size); + } + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "scale0", &scale0); + status |= vsi_nn_kernel_gpu_add_param(node, + "scale1", &scale1); + status |= vsi_nn_kernel_gpu_add_param(node, + "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "in0_zp", &in0_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "cos_zp", &cos_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "sin_zp", &sin_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + default: + break; + } + status = vsi_nn_kernel_gpu_config(node, &gpu_param); +final: + if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr); + if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr); + if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr); + if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr); + return status; +} /* _rope_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + int32_t interleaved, + _internal_rope_layout_e *layout + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]); + int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]); + const _kernel_map_type * kernel_map = _rope_kernel_map; + size_t kernel_map_size = _cnt_of_array( _rope_kernel_map ); + vx_param_description_t * param_def = _rope_kernel_param_def; + vx_kernel_initialize_f initializer = _rope_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + /*only support symmetric int16*/ + if ( ( (in0_dtype == I16 && 
in1_dtype == I16 && out_dtype == I16) || + (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) || + (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) || + (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) || + (in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) || + (in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) && + (in0_zp != 0 || in1_zp != 0 || in2_zp != 0)) + { + return VSI_FAILURE; + } + + if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] && + in1_dtype == in2_dtype) + { + if (inputs[0]->attr.size[0] == 1) + { + *layout = LAYOUT_BNH1; + } + else + { + *layout = LAYOUT_BNHS; + } + } + else if (axis == 0 && in1_dtype == in2_dtype) + { + if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] && + inputs[1]->attr.size[1] == 1) + { + *layout = LAYOUT_BSNH; + } + else + { + *layout = LAYOUT_BNSH; + } + } + + key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + int32_t i = 0; + int32_t interleaved = 0; + int32_t param = 0; + vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_tensor_t* rs_tensors[4] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[4] = { NULL }; + _internal_rope_layout_e layout = LAYOUT_NONE; + + VSI_UNREFERENCED(params); + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved"); + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout ); + if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH) + { + memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + + if (outputs[0]->attr.size[0] == 1) + { + for (i = 1; i < 3; i++) + { + shape[0][i - 1] = shape[0][i]; + shape[1][i - 1] = shape[1][i]; + shape[2][i - 1] = shape[2][i]; + } + shape[0][2] = 1; + shape[1][2] = 1; + shape[2][2] = 1; + } + else + { + int32_t j = 0; + for (i = 0; i < 3; i++) + { + if (shape[1][i] != 1) + { + shape[1][j] = shape[1][i]; + j ++; + } + } + for (; j < 3; j++) + { + shape[1][j] = 1; + } + } + + rs_tensors[0] = vsi_nn_reshape_tensor(graph, + 
inputs[0], shape[0], inputs[0]->attr.dim_num); + rs_tensors[1] = vsi_nn_reshape_tensor(graph, + inputs[1], shape[1], inputs[1]->attr.dim_num); + rs_tensors[2] = vsi_nn_reshape_tensor(graph, + inputs[2], shape[1], inputs[2]->attr.dim_num); + rs_tensors[3] = vsi_nn_reshape_tensor(graph, + outputs[0], shape[2], outputs[0]->attr.dim_num); + + if (outputs[0]->attr.size[0] == 1 && axis > 0) + { + axis--; + } + reshape_tensors[0] = rs_tensors[0]; + reshape_tensors[1] = rs_tensors[1]; + reshape_tensors[2] = rs_tensors[2]; + reshape_tensors[3] = rs_tensors[3]; + } + else + { + reshape_tensors[0] = inputs[0]; + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = inputs[2]; + reshape_tensors[3] = outputs[0]; + } + + param = (interleaved << 16) | axis; + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[3], output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, ¶m); + status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM ); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]); + } + } + + for (i = 0; i < 4; i++) + { + vsi_safe_release_tensor(rs_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( rope, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index 4786abb..8662c71 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] = { TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4) }; static const _kernel_map_type scatter_nd_update_special_update_map[] = { TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4) }; static const _kernel_map_type scatter_nd_update_special_copy_map[] = { TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4) }; /* @@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer) { case _PACK_SELECT_KEY( I8, I8 ): case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( U16, U16 ): { uint16_t M0 = 0; int32_t postShift0 = 0; @@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); 
} break; + case _PACK_SELECT_KEY( F16, F16 ): + break; default: break; } @@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer) { case _PACK_SELECT_KEY( I8, I8 ): case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( U16, U16 ): { uint16_t M1 = 0; int32_t postShift1 = 0; @@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( F16, F16 ): + break; default: break; } @@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special status |= VSI_FAILURE; } + if (input0_dtype == F16) + { + input0_dtype = U16; + } + if (input2_dtype == F16) + { + input2_dtype = U16; + } + if (output_dtype == F16) + { + output_dtype = U16; + } + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0); for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c index f1ad40b..9006707 100644 --- a/src/tim/vx/internal/src/kernel/evis/swish_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); #if (VX_ACTIVATION_EXT_SUPPORT) - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 8ff82f5..d4083d3 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -548,16 +548,16 @@ static vsi_status _gpu_register vsi_status status; vx_kernel_description_t* info; vx_kernel obj; - vsi_nn_context_t context; vx_program program = NULL; const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; + vsi_nn_runtime_option_t* options; + options = ((vsi_nn_graph_prv_t*)graph)->options; #define MAX_BUILDPROGRAM_LEN 1024 char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; size_t cost_bytes = 0; memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); - context = graph->ctx; status = VSI_FAILURE; info = &(kernel->info); @@ -579,21 +579,21 @@ static vsi_status _gpu_register return status; } - if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", - context->config.use_40bits_va ); + options->config.use_40bits_va ); } } else { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", - context->config.evis.ver, context->config.use_40bits_va ); + options->config.evis.ver, options->config.use_40bits_va ); } // Pack build option if( kernel->gpu.sources[active_fmt].build_option.data ) @@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext vsi_status status; vx_kernel_description_t* info; vx_kernel obj; - vsi_nn_context_t context; vx_program program = NULL; const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; + vsi_nn_runtime_option_t* 
options; + options = ((vsi_nn_graph_prv_t*)graph)->options; #define MAX_BUILDPROGRAM_LEN 1024 char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; size_t cost_bytes = 0; memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); - context = graph->ctx; status = VSI_FAILURE; info = &(kernel->info); @@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext return status; } - if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", - context->config.use_40bits_va ); + options->config.use_40bits_va ); } } else { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", - context->config.evis.ver, context->config.use_40bits_va ); + options->config.evis.ver, options->config.use_40bits_va ); } // Pack build option if( kernel->gpu.sources[active_fmt].build_option.data ) @@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector } /* Skip evis if not support */ if( type == VSI_NN_KERNEL_TYPE_EVIS - && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE ) { continue; } @@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader; #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT - if ( graph->ctx->config.subGroupSize == 0 ) + if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 ) { return FALSE; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 92e94f6..bce1b01 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow) #if (VX_TENSOR_GATHER_API_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(gather) #endif -#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT) REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops) -#endif #if (VX_TENSOR_TILE_API_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(tile) #endif -#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm) -#endif #if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT) REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) #endif @@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax) #if (VX_BITCAST_VX_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast) #endif - +REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm) +REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/group_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/group_norm_vx.c new file mode 100644 index 0000000..cdfe633 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/group_norm_vx.c @@ -0,0 +1,89 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* 
+* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if VX_GROUP_NORMALIZATION_VX_SUPPORT +#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num"); + vx_tensor inputs_tensor[3] = { NULL }; + vx_tensor output_tensor = NULL; + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + inputs_tensor[2] = inputs[2]->t; + output_tensor = outputs[0]->t; + + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(kernel); + + if (graph->ctx->config.support_ffd || + graph->ctx->config.support_stream_processor) + { + node = vxGroupNormalizationLayer( + graph->g, + eps, + group_num, + inputs_tensor, + (vx_uint32)input_num, + output_tensor + ); + } + + return (vsi_nn_kernel_node_t)node; +} /* group_norm() */ + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c new file mode 100644 index 0000000..a363b41 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c @@ -0,0 +1,87 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT +#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm) +{ + vsi_nn_kernel_node_t node = NULL; + float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + vx_tensor inputs_tensor[3] = { NULL }; + vx_tensor output_tensor = NULL; + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + inputs_tensor[2] = inputs[2]->t; + output_tensor = outputs[0]->t; + + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(kernel); + + if (graph->ctx->config.support_ffd || + graph->ctx->config.support_stream_processor) + { + node = vxInstanceNormalizationLayer( + graph->g, + eps, + inputs_tensor, + (vx_uint32)input_num, + output_tensor + ); + } + + return (vsi_nn_kernel_node_t)node; +} /* instance_norm() */ + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c index 00a2def..3cdd73b 100644 --- a/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c @@ -30,7 +30,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" -#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) +#if (VX_LAYER_NORMALIZATION_VX_SUPPORT) #define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \ static vsi_nn_kernel_node_t _##kernel_name##setup \ ( \ @@ -71,14 +71,20 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm ) inputs_tensor[2] = inputs[2]->t; output_tensor = outputs[0]->t; - node = vxLayerNormalizationLayer( - graph->g, - eps, - axis, - inputs_tensor, - (uint32_t)input_num, - output_tensor +#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + if (graph->ctx->config.support_ffd || + graph->ctx->config.support_stream_processor) +#endif + { + node = vxLayerNormalizationLayer( + graph->g, + eps, + axis, + inputs_tensor, + (uint32_t)input_num, + output_tensor ); + } return (vsi_nn_kernel_node_t)node; } /* layer_norm() */ diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c index c9a2c84..0d35c8d 100644 --- 
a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { vsi_nn_tensor_attr_t attr; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) ); - attr.vtl = FALSE; + attr.vtl = TRUE; attr.is_const = FALSE; convert_tensor = vsi_nn_CreateTensor(graph, &attr); diff --git a/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c b/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c index 0d93b45..bfb26b9 100644 --- a/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c @@ -30,7 +30,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" -#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT) +#if (VX_RELATIONAL_OPS_VX_SUPPORT) #define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \ static vsi_nn_kernel_node_t _##kernel_name##setup \ @@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops ) VSI_UNREFERENCED(kernel); VSI_UNREFERENCED(output_num); - node = vxRelationalLayer(graph->g, - operation, - inputs_tensor, - (uint32_t)input_num, - outputs[0]->t - ); +#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) + if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0])) + { + return NULL; + } +#endif + +#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) + if (graph->ctx->config.support_stream_processor) +#endif + { + node = vxRelationalLayer( + graph->g, + operation, + inputs_tensor, + (uint32_t)input_num, + outputs[0]->t + ); + } return (vsi_nn_kernel_node_t)node; } /* relational_ops() */ diff --git a/src/tim/vx/internal/src/kernel/vx/swish_vx.c b/src/tim/vx/internal/src/kernel/vx/swish_vx.c index 9b458c6..e758ced 100644 --- a/src/tim/vx/internal/src/kernel/vx/swish_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/swish_vx.c @@ -23,6 +23,7 @@ *****************************************************************************/ #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_node.h" #include "vsi_nn_log.h" @@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish ) VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(input_num); - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type"); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl index bf9fd64..447f197 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl @@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2( } } -#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \ -__kernel void cumsum_##name##toU8_axis2( \ +#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis2( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axis, \ @@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \ int4 coord_out = coord; \ \ src_type sum = (src_type)(0); \ - uint4 dst = (uint4)(0); \ + dst_type dst = (dst_type)(0); \ int tmp_zp = convert_int_rte(output_zp); \ - dst.x = convert_uint_sat(tmp_zp); \ + dst.x = convert_dtype(tmp_zp); \ \ 
float cnt = 0.0f; \ \ if(exclusive && rev) \ { \ coord_out.z = channel - 1; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.z = channel - 1; coord.z > 0; coord.z--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.z--; \ cnt += 1.0f; \ sum += data; \ @@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(exclusive) \ { \ coord_out.z = 0; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.z = 0; coord.z < channel - 1; coord.z++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.z++; \ cnt += 1.0f; \ sum += data; \ @@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(rev) \ { \ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ else \ { \ for(coord.z = 0; coord.z < channel; coord.z++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ } -CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui) -CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef) - - +CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) __kernel void cumsum_F32toF32_axis1( __read_only image2d_array_t input, @@ -233,10 +232,10 @@ __kernel void cumsum_F32toF32_axis1( } } -#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \ -__kernel void cumsum_##name##toU8_axis1( \ - __read_only image2d_array_t input, \ - __write_only image2d_array_t output, \ +#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ int axis, \ int exclusive, \ int rev, \ @@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \ int4 coord_out = coord; \ \ src_type sum = (src_type)(0); \ - uint4 dst = (uint4)(0); \ + dst_type dst = (dst_type)(0); \ int tmp_zp = convert_int_rte(output_zp); \ - dst.x = convert_uint_sat(tmp_zp); \ + dst.x = 
convert_dtype(tmp_zp); \ \ float cnt = 0; \ \ if(exclusive && rev) \ { \ coord_out.y = height - 1; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ \ for(coord.y = height - 1; coord.y > 0; coord.y--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ coord_out.y--; \ sum += data; \ @@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(exclusive) \ { \ coord_out.y = 0; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.y = 0; coord.y < height - 1; coord.y++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ coord_out.y++; \ sum += data; \ @@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(rev) \ { \ for(coord.y = height - 1; coord.y >= 0; coord.y--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ else \ { \ for(coord.y = 0; coord.y < height; coord.y++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ } -CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui) -CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef) - +CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) __kernel void cumsum_F32toF32_axis0( __read_only image2d_array_t input, @@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0( } } -#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \ -__kernel void cumsum_##name##toU8_axis0( \ +#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis0( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axis, \ @@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \ int4 coord_out = coord; \ \ src_type sum = (src_type)(0); \ - uint4 dst = (uint4)(0); \ + dst_type dst = (dst_type)(0); \ int tmp_zp = convert_int_rte(output_zp); \ - dst.x = convert_uint_sat(tmp_zp); \ + dst.x = convert_dtype(tmp_zp); \ \ float cnt = 0; \ \ if(exclusive && rev) \ { \ coord_out.x = width - 
1; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.x = width - 1; coord.x > 0; coord.x--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.x--; \ cnt += 1.0f; \ sum += data; \ @@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(exclusive) \ @@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \ write_imageui(output, coord_out, dst); \ for(coord.x = 0; coord.x < width - 1; coord.x++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.x++; \ cnt += 1.0f; \ sum += data; \ @@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(rev) \ { \ for(coord.x = width - 1; coord.x >= 0; coord.x--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ else \ { \ for(coord.x = 0; coord.x < width; coord.x++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ } -CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui) -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef) +CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl index 3a90480..f89cf5e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl @@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D( } } -__kernel void cumsum_U8toU8_axis1_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - uint4 sum = (uint4)(0); - uint4 dst = (uint4)(0); - - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0; - - if(exclusive && rev) - { - coord.w = height - 1; - write_imageui(output, coord.zw, dst); - 
for(coord.y = height - 1; coord.y > 0; coord.y--) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - coord.w--; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - write_imageui(output, coord.zw, dst); - for(coord.y = 0; coord.y < height - 1; coord.y++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - coord.w++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.y = height - 1; coord.y >= 0; coord.y--) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.y = 0; coord.y < height; coord.y++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } -} - -__kernel void cumsum_F32toU8_axis1_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - float4 sum = (float4)(0); - uint4 dst = (uint4)(0); - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0; - - if(exclusive && rev) - { - coord.w = height - 1; - write_imageui(output, coord.zw, dst); - for(coord.y = height - 1; coord.y > 0; coord.y--) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - coord.w--; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - write_imageui(output, coord.zw, dst); - for(coord.y = 0; coord.y < height - 1; coord.y++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - coord.w++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.y = height - 1; coord.y >= 0; coord.y--) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.y = 0; coord.y < height; coord.y++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } +#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, 
image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int chn, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + src_type sum = (src_type)(0); \ + dst_type dst = (dst_type)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst.x = convert_dtype(tmp_zp); \ + \ + float cnt = 0; \ + \ + if(exclusive && rev) \ + { \ + coord.w = height - 1; \ + image_write(output, coord.zw, dst); \ + for(coord.y = height - 1; coord.y > 0; coord.y--) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + coord.w--; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.zw, dst); \ + } \ + } \ + else if(exclusive) \ + { \ + image_write(output, coord.zw, dst); \ + for(coord.y = 0; coord.y < height - 1; coord.y++) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + coord.w++; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.zw, dst); \ + } \ + } \ + else if(rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.xy, dst); \ + } \ + } \ + else \ + { \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.xy, dst); \ + } \ + } \ } +CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) __kernel void cumsum_F32toF32_axis0_2D( __read_only image2d_t input, @@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D( } } -__kernel void cumsum_U8toU8_axis0_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - uint4 sum = (uint4)(0); - uint4 dst = (uint4)(0); - - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0.0f; - - if(exclusive && rev) - { - coord.x = width - 1; - coord.z = coord.x; - write_imageui(output, coord.zw, dst); - for(; coord.x > 0; coord.x--) - { - uint4 data = read_imageui(input, coord.xy); - coord.z--; - cnt += 1.0; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = 
(uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - coord.z = 0; - write_imageui(output, coord.zw, dst); - for(coord.x = 0; coord.x < width - 1; coord.x++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - coord.z++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.x = width - 1; coord.x >= 0; coord.x--) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.x = 0; coord.x < width; coord.x++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } -} - -__kernel void cumsum_F32toU8_axis0_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - float4 sum = (float4)(0); - uint4 dst = (uint4)(0); - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0.0f; - if(exclusive && rev) - { - coord.x = width - 1; - coord.z = coord.x; - write_imageui(output, coord.zw, dst); - for(; coord.x > 0; coord.x--) - { - float4 data = read_imagef(input, coord.xy); - coord.z--; - cnt += 1.0; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - coord.z = 0; - write_imageui(output, coord.zw, dst); - for(coord.x = 0; coord.x < width - 1; coord.x++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - coord.z++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.x = width - 1; coord.x >= 0; coord.x--) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.x = 0; coord.x < width; coord.x++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } +#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int chn, \ + int 
input_zp, \
+    float in_out_scale, \
+    float in_out_zp_scale, \
+    float output_zp \
+    ) \
+{ \
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+    src_type sum = (src_type)(0); \
+    dst_type dst = (dst_type)(0); \
+ \
+    int tmp_zp = convert_int_rte(output_zp); \
+    dst.x = convert_dtype(tmp_zp); \
+ \
+    float cnt = 0.0f; \
+ \
+    if(exclusive && rev) \
+    { \
+        coord.x = width - 1; \
+        coord.z = coord.x; \
+        image_write(output, coord.zw, dst); \
+        for(; coord.x > 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            coord.z--; \
+            cnt += 1.0; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(exclusive) \
+    { \
+        coord.z = 0; \
+        image_write(output, coord.zw, dst); \
+        for(coord.x = 0; coord.x < width - 1; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.z++; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(rev) \
+    { \
+        for(coord.x = width - 1; coord.x >= 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+    else \
+    { \
+        for(coord.x = 0; coord.x < width; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
 }
+CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
index e535b86..60163a7 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
@@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
     coord.z ++;
     } while (coord.z < depth);
 }
+
+__kernel void one_hot_I32toBF16
+    (
+    __read_only  image2d_t       input,
+    __write_only image2d_array_t output,
+                 int             depth,
+                 uint            on_value,
+                 uint            off_value,
+                 float           inputScale,
+                 float           inputTail
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+
+    int4 src = read_imagei(input, coord.xy);
+
+    int val = convert_int(convert_float(src.x) * inputScale - inputTail);
+    do
+    {
+        uint4 dst;
+        dst.x = val == coord.z ?
on_value : off_value; + + write_imageui(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl new file mode 100644 index 0000000..08ad576 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl @@ -0,0 +1,373 @@ +__kernel void rope_F32_F32toF32_axis0 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + float4 cos, sin; + + READ_IMAGEF_2DARRAY(cos, cos_cache, coord); + READ_IMAGEF_2DARRAY(sin, sin_cache, coord); + coord.x = coord.x * step; + float4 src0 = read_imagef(input, coord); + int4 coord_out = coord; + + coord.x += half_head_size; + float4 src1 = read_imagef(input, coord); + + float4 dst0 = src0 * cos - src1 * sin; + float4 dst1 = src0 * sin + src1 * cos; + + write_imagef(output, coord_out, dst0); + coord_out.x += half_head_size; + write_imagef(output, coord_out, dst1); +} + +__kernel void rope_F32_F32toF32_axis1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + float4 cos, sin; + + READ_IMAGEF_2DARRAY(cos, cos_cache, coord); + READ_IMAGEF_2DARRAY(sin, sin_cache, coord); + coord.y = coord.y * step; + float4 src0 = read_imagef(input, coord); + int4 coord_out = coord; + coord.y += half_head_size; + float4 src1 = read_imagef(input, coord); + + float4 dst0 = src0 * cos - src1 * sin; + float4 dst1 = src0 * sin + src1 * cos; + + write_imagef(output, coord_out, dst0); + coord_out.y += half_head_size; + write_imagef(output, coord_out, dst1); +} + +__kernel void rope_F32_F32toF32_axis2 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + float4 cos = read_imagef(cos_cache, coord); + float4 sin = read_imagef(sin_cache, coord); + coord.z = coord.z * step; + float4 src0 = read_imagef(input, coord); + int4 coord_out = coord; + coord.z += half_head_size; + float4 src1 = read_imagef(input, coord); + + float4 dst0 = src0 * cos - src1 * sin; + float4 dst1 = src0 * sin + src1 * cos; + + write_imagef(output, coord_out, dst0); + coord_out.z += half_head_size; + write_imagef(output, coord_out, dst1); +} + +__kernel void rope_I32_I32toI32_axis0 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord 
= (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord); + coord.x = coord.x * step; + float4 src0 = convert_float4(read_imagei(input, coord)); + int4 coord_out = coord; + + coord.x += half_head_size; + float4 src1 = convert_float4(read_imagei(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + int4 dst0 = convert_int4_rte(_dst0); + int4 dst1 = convert_int4_rte(_dst1); + + write_imagei(output, coord_out, dst0); + coord_out.x += half_head_size; + write_imagei(output, coord_out, dst1); +} + +__kernel void rope_I32_I32toI32_axis1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord); + coord.y = coord.y * step; + float4 src0 = convert_float4(read_imagei(input, coord)); + int4 coord_out = coord; + + coord.y += half_head_size; + float4 src1 = convert_float4(read_imagei(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + int4 dst0 = convert_int4_rte(_dst0); + int4 dst1 = convert_int4_rte(_dst1); + + write_imagei(output, coord_out, dst0); + coord_out.y += half_head_size; + write_imagei(output, coord_out, dst1); +} + +__kernel void rope_I32_I32toI32_axis2 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + float4 cos = convert_float4(read_imagei(cos_cache, coord)); + float4 sin = convert_float4(read_imagei(sin_cache, coord)); + coord.z = coord.z * step; + float4 src0 = convert_float4(read_imagei(input, coord)); + int4 coord_out = coord; + + coord.z += half_head_size; + float4 src1 = convert_float4(read_imagei(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = cos - cos_zp; + sin = sin - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + int4 dst0 = convert_int4_rte(_dst0); + int4 dst1 = convert_int4_rte(_dst1); + + write_imagei(output, coord_out, dst0); + coord_out.z += half_head_size; + write_imagei(output, coord_out, dst1); +} + +__kernel void rope_U32_U32toU32_axis0 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + 
__write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + uint4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord); + coord.x = coord.x * step; + float4 src0 = convert_float4(read_imageui(input, coord)); + int4 coord_out = coord; + + coord.x += half_head_size; + float4 src1 = convert_float4(read_imageui(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + uint4 dst0 = convert_uint4_rte(_dst0); + uint4 dst1 = convert_uint4_rte(_dst1); + + write_imageui(output, coord_out, dst0); + coord_out.x += half_head_size; + write_imageui(output, coord_out, dst1); +} + +__kernel void rope_U32_U32toU32_axis1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + uint4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord); + coord.y = coord.y * step; + float4 src0 = convert_float4(read_imageui(input, coord)); + int4 coord_out = coord; + + coord.y += half_head_size; + float4 src1 = convert_float4(read_imageui(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + uint4 dst0 = convert_uint4_rte(_dst0); + uint4 dst1 = convert_uint4_rte(_dst1); + + write_imageui(output, coord_out, dst0); + coord_out.y += half_head_size; + write_imageui(output, coord_out, dst1); +} + +__kernel void rope_U32_U32toU32_axis2 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + float4 cos = convert_float4(read_imageui(cos_cache, coord)); + float4 sin = convert_float4(read_imageui(sin_cache, coord)); + coord.z = coord.z * step; + float4 src0 = convert_float4(read_imageui(input, coord)); + int4 coord_out = coord; + + coord.z += half_head_size; + float4 src1 = convert_float4(read_imageui(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = cos - cos_zp; + sin = sin - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + uint4 dst0 = convert_uint4_rte(_dst0); + uint4 dst1 = convert_uint4_rte(_dst1); + + write_imageui(output, coord_out, dst0); + coord_out.z += 
half_head_size; + write_imageui(output, coord_out, dst1); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx new file mode 100644 index 0000000..8a8d1ad --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx @@ -0,0 +1,307 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int top; +_viv_uniform int left; +_viv_uniform float out_scale_r; +_viv_uniform float out_scale_g; +_viv_uniform float out_scale_b; +_viv_uniform float out_zp_r; +_viv_uniform float out_zp_g; +_viv_uniform float out_zp_b; +_viv_uniform float pad_v_r; +_viv_uniform float pad_v_g; +_viv_uniform float pad_v_b; +_viv_uniform float scale_w; +_viv_uniform float scale_h; +_viv_uniform int resize_max_w; +_viv_uniform int resize_max_h; +_viv_uniform int out_height; +_viv_uniform int r_order; +_viv_uniform int b_order; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; +_viv_uniform VXC_512Bits uniLeftToFloat32_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +__kernel void custom_letterbox_U8toU8 + ( + __read_only image2d_t input, + __write_only image2d_t output, + int top_, + int bottom_, + int left_, + int right_, + float mean_r_, + float mean_g_, + float mean_b_, + float scale_r_, + float scale_g_, + float scale_b_, + int pad_r_, + int pad_g_, + int pad_b_, + int reverse_channel + ) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord = coord_out; + uint4 dst = (uint4)(0,0,0,0); + vxc_uchar8 result; + + if (coord_out.x < left || coord_out.x >= resize_max_w || + coord_out.y < top || coord_out.y >= resize_max_h) + { + dst.x = convert_uint(pad_v_r); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(pad_v_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(pad_v_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + return; + } + + float in_x = convert_float(coord_out.x - left) * scale_w; + float in_y = convert_float(coord_out.y - top) * scale_h; + float left_x_f = floor(in_x); + float top_y_f = floor(in_y); + float x_lerp = in_x - left_x_f; + float y_lerp = in_y - top_y_f; + int left_x_idx = convert_int(left_x_f); + int top_y_idx = convert_int(top_y_f); + + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx); + vxc_uchar8 top_data, bottom_data; + + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + float4 left4 = (float4)(0,0,0,0); + float4 right4 = (float4)(0,0,0,0); + float4 top4 = (float4)(0,0,0,0); + float4 bottom4 = (float4)(0,0,0,0); + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(right4, 
bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + bottom4 = right4 * x_lerp + left4; + float4 out = (bottom4 - top4) * y_lerp + top4; + + dst.x = convert_uint(out.x * out_scale_r + out_zp_r ); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(out.y * out_scale_g + out_zp_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(out.z * out_scale_b + out_zp_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_letterbox_U8toI8 + ( + __read_only image2d_t input, + __write_only image2d_t output, + int top_, + int bottom_, + int left_, + int right_, + float mean_r_, + float mean_g_, + float mean_b_, + float scale_r_, + float scale_g_, + float scale_b_, + int pad_r_, + int pad_g_, + int pad_b_, + int reverse_channel + ) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord = coord_out; + int4 dst = (int4)(0,0,0,0); + vxc_char8 result; + + if (coord_out.x < left || coord_out.x >= resize_max_w || + coord_out.y < top || coord_out.y >= resize_max_h) + { + dst.x = convert_int(pad_v_r); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(pad_v_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(pad_v_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + return; + } + + float in_x = convert_float(coord_out.x - left) * scale_w; + float in_y = convert_float(coord_out.y - top) * scale_h; + float left_x_f = floor(in_x); + float top_y_f = floor(in_y); + float x_lerp = in_x - left_x_f; + float y_lerp = in_y - top_y_f; + int left_x_idx = convert_int(left_x_f); + int top_y_idx = convert_int(top_y_f); + + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx); + vxc_char8 top_data, bottom_data; + + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + float4 left4 = (float4)(0,0,0,0); + float4 right4 = (float4)(0,0,0,0); + float4 top4 = (float4)(0,0,0,0); + float4 bottom4 = (float4)(0,0,0,0); + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + top4 = right4 * x_lerp 
+ left4; + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + bottom4 = right4 * x_lerp + left4; + float4 out = (bottom4 - top4) * y_lerp + top4; + + dst.x = convert_int(out.x * out_scale_r + out_zp_r); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(out.y * out_scale_g + out_zp_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(out.z * out_scale_b + out_zp_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_letterbox_U8toF16 + ( + __read_only image2d_t input, + __write_only image2d_t output, + int top_, + int bottom_, + int left_, + int right_, + float mean_r_, + float mean_g_, + float mean_b_, + float scale_r_, + float scale_g_, + float scale_b_, + int pad_r_, + int pad_g_, + int pad_b_, + int reverse_channel + ) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord = coord_out; + half4 tmp; + vxc_half8 dst_temp; + vxc_ushort8 dst; + + if (coord_out.x < left || coord_out.x >= resize_max_w || + coord_out.y < top || coord_out.y >= resize_max_h) + { + + float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0); + _viv_asm(CONV, tmp, pad); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + r_order; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + tmp.x = tmp.y; + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + out_height; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + tmp.x = tmp.z; + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + b_order; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + return; + } + + float in_x = convert_float(coord_out.x - left) * scale_w; + float in_y = convert_float(coord_out.y - top) * scale_h; + float left_x_f = floor(in_x); + float top_y_f = floor(in_y); + float x_lerp = in_x - left_x_f; + float y_lerp = in_y - top_y_f; + int left_x_idx = convert_int(left_x_f); + int top_y_idx = convert_int(top_y_f); + + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx); + vxc_uchar8 top_data, bottom_data; + + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + float4 left4 = (float4)(0,0,0,0); + float4 right4 = (float4)(0,0,0,0); + float4 top4 = (float4)(0,0,0,0); + float4 bottom4 = (float4)(0,0,0,0); + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 
0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + bottom4 = right4 * x_lerp + left4; + float4 out = (bottom4 - top4) * y_lerp + top4; + + float4 out_temp = (float4)(0,0,0,0); + out_temp.x = out.x * out_scale_r + out_zp_r; + _viv_asm(CONV, tmp, out_temp); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + r_order; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + out_temp.x = out.y * out_scale_g + out_zp_g; + _viv_asm(CONV, tmp, out_temp); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + out_height; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + out_temp.x = out.z * out_scale_b + out_zp_b; + _viv_asm(CONV, tmp, out_temp); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + out_height; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx index e3ca29e..432a228 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx @@ -10,7 +10,12 @@ #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32; +_viv_uniform VXC_512Bits uniExtract8Bin_2x8; _viv_uniform int sf_size; +_viv_uniform float srcScale; +_viv_uniform float srcZP; +_viv_uniform float dstScale; +_viv_uniform float dstZP; #define F_MAX(a,b) ((a)>(b)?(a):(b)) __kernel void Softmax2VXC ( @@ -19,35 +24,37 @@ __kernel void Softmax2VXC int axis ) { - int4 coord_in = (int4)(0,0,0,0); - float fMax = 0.0; + int4 coord_in = (int4)(0, get_global_id(0), 0, 0); + float fMax = 0; for (int i = 0; i < sf_size; i++) { - vxc_char8 val; + vxc_short8 val; + vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, val_h, val, 16); float fval; - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); fMax = F_MAX(fMax, fval); } - float fProbSum = 0.0f; vxc_short8 dst; for (int i = 0; i < sf_size; i++) { - vxc_char8 val; - + vxc_short8 val; + vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, val_h, val, 16); float fval; - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); - + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); float fOut = (float)exp(fval - fMax); fProbSum += fOut; half hVal; - 
_viv_asm(CONV,hVal,fOut); - _viv_asm(COPY,dst,hVal, 4); + _viv_asm(CONV, hVal, fOut); + _viv_asm(COPY, dst, hVal, 4); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } @@ -56,15 +63,68 @@ __kernel void Softmax2VXC vxc_short8 val; vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); float fval; _viv_asm(COPY, val_h,val, 16); VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); - - float fOut =fval/fProbSum; + float fOut =fval / fProbSum; half hVal; - _viv_asm(CONV,hVal,fOut); + _viv_asm(CONV, hVal, fOut); _viv_asm(COPY,dst,hVal, 4); VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } } + +__kernel void Softmax2VXC_u8 + ( + image2d_array_t input, + image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(0, get_global_id(0), 0, 0); + float fMax = -3.4e38f; + for (int i = 0; i < sf_size; i++) + { + vxc_uchar8 val; + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + fval = (fval - srcZP) * srcScale; + fMax = F_MAX(fMax, fval); + } + + float fProbSum = 0.0f; + vxc_uchar8 dst; + for (int i = 0; i < sf_size; i++) + { + vxc_uchar8 val; + + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + fval = (fval - srcZP) * srcScale; + float fOut = (float)exp(fval - fMax); + fProbSum += fOut; + } + + for (int i = 0; i < sf_size; i++) + { + vxc_uchar8 val; + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + fval = (fval - srcZP) * srcScale; + + float fOut = exp(fval - fMax) / fProbSum; + + fOut = fOut * dstScale + dstZP; + short dst0; + _viv_asm(CONV, dst0, fOut); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx index 33edef8..15f596e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx @@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1; _viv_uniform float output_scale; _viv_uniform float output_zp; -#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \ +#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ @@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \ int gidz = get_global_id(1); \ int4 coord = (int4)(gidx, 0, gidz, 0); \ - vxc_short8 src0; \ + load_type src; \ src_type in_h; \ float4 sumsqr; \ float4 tmpSumSqr = (float4)(0); \ @@ -43,9 +43,9 @@ 
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## { \ for(coord.y = 0; coord.y < height;) \ { \ - VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ coord.y++; \ - _viv_asm(COPY, in_h, src0, 16); \ + _viv_asm(COPY, in_h, src, 16); \ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ tmpSumSqr += sumsqr; \ } \ @@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, coord_out, data); \ } \ } -GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8) -GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8) +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8) -#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \ +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ @@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \ \ int2 coord = (int2)(gidx, get_global_id(1)); \ - vxc_short8 src0; \ + load_type src; \ src_type in_h; \ float4 sumsqr = (float4)(0); \ \ @@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## \ if(gidx < width) \ { \ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - _viv_asm(COPY, in_h, src0, 16); \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src, 16); \ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \ sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \ @@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, coord_out, data); \ } \ } -GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) -GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8) +GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8) #define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ @@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( _viv_asm(CONV_RTE, tmpVal0, norm); \ norm = alpha * tmpData1 + bias_val; \ _viv_asm(CONV_RTE, tmpVal1, norm); \ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ } @@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ float4 norm; \ norm = alpha * tmpData0 + bias_val; \ - _viv_asm(CONV, tmpVal0, norm); \ + _viv_asm(CONV_RTE, tmpVal0, 
norm); \ norm = alpha * tmpData1 + bias_val; \ - _viv_asm(CONV, tmpVal1, norm); \ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } @@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ float4 norm; \ norm = alpha * tmpData0 + bias_val; \ - _viv_asm(CONV, tmpVal0, norm); \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ norm = alpha * tmpData1 + bias_val; \ _viv_asm(CONV_RTE, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ @@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4) #define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ @@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ float4 norm; \ norm = alpha * tmpData0 + bias_val; \ - _viv_asm(CONV, tmpVal0, norm); \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ norm = alpha * tmpData1 + bias_val; \ - _viv_asm(CONV, tmpVal1, norm); \ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } @@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx b/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx index 695601d..89d5b05 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx @@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4; _viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4; _viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4; _viv_uniform VXC_512Bits uniExtact8Bin_2x8; -_viv_uniform int inputZP0; -_viv_uniform int inputZP1; -_viv_uniform float input_scale0; -_viv_uniform float input_scale1; -_viv_uniform float outputZP; -#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \ - __kernel void prelu_##name0##to##name1( \ +_viv_uniform int input0_zp; +_viv_uniform int input1_zp; +_viv_uniform float 
input0_scale; +_viv_uniform float input1_scale; +_viv_uniform float output_zp; +#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \ + __kernel void prelu_##name( \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ {\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\ - vxc_float4 vecA, vecB, vecC, vecD;\ + float4 vecA, vecB, vecC, vecD;\ input_type0 srcA;\ copy_type0 src0;\ vxc_short8 srcB;\ vxc_half8 src1;\ - input_type0 input_ZP;\ + input_type0 zp;\ VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src0, srcA, 16); \ VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src1, srcB, 16); \ \ - _viv_asm(COPY, input_ZP, inputZP0, 4);\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ + _viv_asm(COPY, zp, input0_zp, 4);\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ uniDataSubZPtoFp32Part0_4x4); \ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ + VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ uniDataSubZPtoFp32Part1_4x4);\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\ VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\ \ - vecA = vecA * input_scale0;\ - vecB = vecB * input_scale0;\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \ - vecA = maxData0 + vecC * minData0 + outputZP;\ - vecB = maxData1 + vecD * minData1 + outputZP;\ + vecA = vecA * input0_scale;\ + vecB = vecB * input0_scale;\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \ + float4 minData0 = vecA < 0 ? vecA : 0.0; \ + float4 minData1 = vecB < 0 ? 
vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + output_zp;\ + vecB = maxData1 + vecD * minData1 + output_zp;\ convert_type dst0, dst1;\ _viv_asm(CONV_RTE, dst0, vecA);\ _viv_asm(CONV_RTE, dst1, vecB);\ @@ -164,49 +164,49 @@ _viv_uniform float outputZP; VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ } // name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type -PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) -PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) -PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) -PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) -PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) -PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) +PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) +PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) +PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) +PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) -#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \ - __kernel void prelu_##name0##to##name1##_2D( \ +#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \ + __kernel void prelu_##name##_2D( \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ {\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\ - vxc_float4 vecA, vecB, vecC, vecD;\ + float4 vecA, vecB, vecC, vecD;\ input_type0 srcA;\ copy_type0 src0;\ vxc_short8 srcB;\ vxc_half8 src1;\ - input_type0 input_ZP;\ + input_type0 zp;\ VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src0, srcA, 16); \ VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src1, srcB, 16); \ \ - _viv_asm(COPY, input_ZP, inputZP0, 4);\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + _viv_asm(COPY, zp, input0_zp, 4);\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\ VXC_DP4x4(vecD, src1, src1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\ \ - vecA = vecA * input_scale0;\ - vecB = vecB * input_scale0;\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \ - vecA = maxData0 + vecC * minData0 + outputZP;\ - vecB = maxData1 + vecD * minData1 + outputZP;\ + vecA = vecA * input0_scale;\ + vecB = vecB * input0_scale;\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \ + float4 minData0 = vecA < 0 ? vecA : 0.0; \ + float4 minData1 = vecB < 0 ? vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + output_zp;\ + vecB = maxData1 + vecD * minData1 + output_zp;\ convert_type dst0, dst1;\ _viv_asm(CONV_RTE, dst0, vecA);\ _viv_asm(CONV_RTE, dst1, vecB);\ @@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ } -PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) -PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) -PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) -PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) -PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) -PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) +PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) +PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) +PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) +PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) -#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \ - __kernel void prelu_U8U8to##name##_2D( \ +#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \ + __kernel void prelu_##name##_2D( \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ {\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\ - vxc_float4 vecA, vecB, vecC, vecD;\ - vxc_uchar16 src0;\ - vxc_uchar16 src1;\ - vxc_uchar16 input_ZP0;\ - vxc_uchar16 input_ZP1;\ + float4 vecA, vecB, vecC, vecD;\ + src0_type src0;\ + src1_type src1;\ + short zp0;\ + short zp1;\ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ \ - _viv_asm(COPY, input_ZP0, inputZP0, 
4);\ - VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ - VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ - _viv_asm(COPY, input_ZP1, inputZP1, 4);\ - VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ - VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + _viv_asm(COPY, zp0, input0_zp, 2);\ + VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + _viv_asm(COPY, zp1, input1_zp, 4);\ + VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ \ - vecA = vecA * input_scale0;\ - vecB = vecB * input_scale0;\ - vecC = vecC * input_scale1;\ - vecD = vecD * input_scale1;\ - vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \ - vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \ - vecA = maxData0 + vecC * minData0 + outputZP;\ - vecB = maxData1 + vecD * minData1 + outputZP;\ + vecA = vecA * input0_scale;\ + vecB = vecB * input0_scale;\ + vecC = vecC * input1_scale;\ + vecD = vecD * input1_scale;\ + float4 maxData0 = vecA >= 0 ? vecA : 0.0; \ + float4 maxData1 = vecB >= 0 ? vecB : 0.0; \ + float4 minData0 = vecA < 0 ? vecA : 0.0; \ + float4 minData1 = vecB < 0 ? vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + output_zp;\ + vecB = maxData1 + vecD * minData1 + output_zp;\ convert_type dst0, dst1;\ _viv_asm(CONV_RTE, dst0, vecA);\ _viv_asm(CONV_RTE, dst1, vecB);\ @@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ } -PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16) -PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8) +PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx new file mode 100644 index 0000000..6fa4e3e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx @@ -0,0 +1,181 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8PostProcess_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8; +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8; +_viv_uniform int out_height; + +__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 1) >> 2; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, tmp, result; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 dst0; + while (coord_out.y < out_height) + { + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} + +_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8; +__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers + ( + __read_only 
image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 3) >> 3; + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; + + vxc_uchar16 in0, in1, dst0, dst1; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 tmp; + while (coord_out.y < out_height) + { + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx new file mode 100644 index 0000000..32c188f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx @@ -0,0 +1,102 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8PostProcess_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4; +__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6; + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 data; + + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + 
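    /* Reference sketch only (not emitted by this kernel): every U8 half-pixel-centers
     * upsample variant in these files (2x/3x/4x/8x) is a specialization of ordinary
     * bilinear resize, with the per-phase fractional weights baked into the
     * uniResize*_* dot-product tables and the requantization folded into multAndoutZP.
     * A 1-D scalar version of that mapping, using illustrative names in/out/in_w/out_w:
     *
     *     float scale = (float)in_w / (float)out_w;
     *     for (int x = 0; x < out_w; ++x)
     *     {
     *         float sx = (x + 0.5f) * scale - 0.5f;          // half_pixel_centers
     *         int   x0 = (int)floorf(sx);
     *         float fx = sx - (float)x0;
     *         int   xa = x0 < 0 ? 0 : x0;
     *         int   xb = (x0 + 1 > in_w - 1) ? in_w - 1 : x0 + 1;
     *         out[x] = (unsigned char)(0.5f + (1.0f - fx) * in[xa] + fx * in[xb]);
     *     }
     */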
coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx new file mode 100644 index 0000000..06ddcae --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx @@ -0,0 +1,167 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8PostProcess_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8; +__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 7) >> 4; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 tmp; + while (coord_out.y < out_height) + { + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + VXC_OP4_NoDest(img_store_3d, output, 
coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx new file mode 100644 index 0000000..d321d79 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx @@ -0,0 +1,303 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform int half_head_size; +_viv_uniform VXC_512Bits uniATimesB_0_4x4; +_viv_uniform VXC_512Bits uniATimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bnhs \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 
get_global_id(2)); \ + int4 coord_out = coord_in; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + dst_type dst; \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.y += half_head_size; \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.y += half_head_size; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bnhs + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_out = coord_in; + + int8 input_desc; + 
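    /* The _bnhs kernels in this file apply "rotate-half" RoPE: element i of a head is
     * paired with element i + half_head_size. Per pair, in plain float form:
     *
     *     out[i]                  = in[i] * cos[i] - in[i + half_head_size] * sin[i];
     *     out[i + half_head_size] = in[i] * sin[i] + in[i + half_head_size] * cos[i];
     *
     * The integer variants above evaluate the same rotation with the tensor scales
     * folded into the scale0/scale1 uniforms and the result offset by output_zp before
     * narrowing; how the host derives those uniforms is set up outside this file and is
     * not restated here.
     */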
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + coord_in.y += half_head_size; + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 - data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 * scale1 + data4 * scale0 + output_zp; + data3 = data3 * scale1 + data5 * scale0 + output_zp; + + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + coord_out.y += half_head_size; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bnhs \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord_in; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.y += half_head_size; \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.y += half_head_size; \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bnhs + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_out = coord_in; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y += half_head_size; + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 data0 = src0 * cos0 - src2 * sin0; + float4 data1 = src1 * cos1 - src3 * sin1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + data0 = src0 * sin0 + src2 * cos0; + data1 = src1 * sin1 + src3 * cos1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + coord_out.y += half_head_size; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx new file mode 100644 index 0000000..d2aab97 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx @@ -0,0 +1,245 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform int half_head_size; +_viv_uniform VXC_512Bits uniATimesB_0_4x4; +_viv_uniform VXC_512Bits uniATimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bnh1 \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + VXC_ReadImage(data0, 
input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord.x += half_head_size; \ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + dst_type dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bnh1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + coord.x += half_head_size; + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + 
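    /* For the asymmetrically quantized ROPE_ASYM_* variants defined further down in
     * this file, the zero point of each operand is removed before the rotation. A
     * scalar sketch, treating in0_zp/cos_zp/sin_zp and scale0/scale1/output_zp as
     * host-provided parameters (cosq/sinq are illustrative names for the quantized
     * caches):
     *
     *     float a0 = (float)(in[i]        - in0_zp);
     *     float a1 = (float)(in[i + half] - in0_zp);     // half == half_head_size
     *     float c  = (float)(cosq[i] - cos_zp);
     *     float s  = (float)(sinq[i] - sin_zp);
     *     float r0 = a0 * c * scale0 - a1 * s * scale1 + output_zp;
     *     float r1 = a0 * s * scale1 + a1 * c * scale0 + output_zp;
     *     // r0 -> out[i], r1 -> out[i + half], rounded to nearest and narrowed
     */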
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 - data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 + data4; + data3 = data3 + data5; + + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bnh1 \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord.x += half_head_size; \ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = 
convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bnh1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += half_head_size; + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 data0 = src0 * cos0 - src2 * sin0; + float4 data1 = src1 * cos1 - src3 * sin1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + data0 = src0 * sin0 + src2 * cos0; + data1 = src1 * sin1 + src3 * cos1; + + 
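    /* BF16 path: each bfloat16 lane is widened to float by moving its 16 bits into the
     * upper half of a 32-bit word (uniConvBF16toF32_Part0/1 against a zero vector), the
     * rotation runs in float, and uniExtractOddData_2x8 keeps only the upper 16 bits of
     * each float result, i.e. a truncating float -> bfloat16 narrowing. Scalar sketch of
     * that narrowing (assuming the same truncation, no rounding):
     *
     *     unsigned int   bits = *(unsigned int *)&f;     // f is the float result
     *     unsigned short bf16 = (unsigned short)(bits >> 16);
     */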
_viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx new file mode 100644 index 0000000..a77830b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx @@ -0,0 +1,312 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bsnh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + dst_type dst; \ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + 
data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bsnh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + _viv_asm(CONV_RTE, dst0, data2); + 
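    /* The _bsnh kernels here (and the _bnsh kernels in rope_3.vx) use the interleaved
     * RoPE layout: cos/sin entry k rotates the adjacent pair (in[2k], in[2k+1]) rather
     * than elements half a head apart, which is why coord_in.x is doubled before the
     * loads. Per pair, in plain float form:
     *
     *     out[2k]     = in[2k] * cos[k] - in[2k + 1] * sin[k];
     *     out[2k + 1] = in[2k] * sin[k] + in[2k + 1] * cos[k];
     *
     * The uniAEvenTimesB_* / uniAOddTimesB_* tables pick the even and odd lanes of each
     * 8-element block so the rotation is evaluated with dot products over the packed
     * vector.
     */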
_viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4; +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4; +#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bsnh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + \ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bsnh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 even = (float4)(src0.xz, src1.xz); + float4 odd = (float4)(src0.yw, src1.yw); + float4 data0 = even * cos0 - odd * sin0; + float4 data1 = even * sin0 + odd * cos0; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 
0,VXC_RM_TowardZero, 0)); + + even = (float4)(src2.xz, src3.xz); + odd = (float4)(src2.yw, src3.yw); + data0 = even * cos1 - odd * sin1; + data1 = even * sin1 + odd * cos1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx new file mode 100644 index 0000000..3fb11f9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx @@ -0,0 +1,312 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bnsh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + dst_type dst; \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bnsh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4; +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4; +#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bnsh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + data2 = l10 * cos1 * scale0 - l11 * sin1 * 
scale1 + output_zp; \ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bnsh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 even = (float4)(src0.xz, src1.xz); + float4 odd = (float4)(src0.yw, src1.yw); + float4 data0 = even * cos0 - odd * sin0; + float4 data1 = even * sin0 + odd * cos0; + + _viv_asm(COPY, v0, 
data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + even = (float4)(src2.xz, src3.xz); + odd = (float4)(src2.yw, src3.yw); + data0 = even * cos1 - odd * sin1; + data1 = even * sin1 + odd * cos1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx index 1118356..7e2970a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx @@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \ } SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1) SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1) +SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2) +SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2) + +#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \ +__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \ + __read_only image2d_t input_ref, \ + image2d_t temp_ref, \ + image2d_t output0 \ + ) \ +{ \ + int gidx = get_global_id(0); \ + Image img0 = create_image_from_image2d(input_ref, 2); \ + Image img1 = create_image_from_image2d(temp_ref, 2); \ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \ + data_type src, dst; \ + src = in_ptr[gidx]; \ + vxc_ushort8 mp0; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift0_Lo_2x8); \ + out_ptr[gidx] = dst; \ +} +SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8) +SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8) + +#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \ +__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \ + __read_only image2d_t input_index, \ + __read_only image2d_t input_update, \ + image2d_t temp_ref, \ + image2d_t input0, \ + image2d_t output1, \ + int width, int area, int vol, int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + Image img1 = create_image_from_image2d(input_index, 4); \ + Image img2 = create_image_from_image2d(input_update, 2); \ + Image img3 = create_image_from_image2d(temp_ref, 2); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global data_type* update_ptr = (__global data_type*)img2.ptr; \ + __global data_type* output_ptr = (__global data_type*)img3.ptr; \ + data_type dst; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \ + data_type src = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \ + int loc = idx * output_width + gidx; \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift1_Lo_2x8); \ + output_ptr[loc] = dst; \ +} +SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8) 
+SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8) + +__kernel void scatter_nd_update_ref2out_F16toF16( + __read_only image2d_t input_ref, + image2d_t temp_ref, + image2d_t output0 + ) +{ + int gidx = get_global_id(0); + Image img0 = create_image_from_image2d(input_ref, 2); + Image img1 = create_image_from_image2d(temp_ref, 2); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr; + out_ptr[gidx] = in_ptr[gidx]; +} + +__kernel void scatter_nd_update_update2ref_F16toF16_16x( + __read_only image2d_t input_index, + __read_only image2d_t input_update, + image2d_t temp_ref, + image2d_t input0, + image2d_t output1, + int width, int area, int vol, int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + Image img1 = create_image_from_image2d(input_index, 4); + Image img2 = create_image_from_image2d(input_update, 2); + Image img3 = create_image_from_image2d(temp_ref, 2); + __global int* index_ptr = (__global int*)img1.ptr; + __global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr; + __global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + int loc = idx * output_width + gidx; + output_ptr[loc] = update_ptr[gidy * update_width + gidx]; +} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 5d4159a..9503736 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -9841,6 +9841,315 @@ CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ "; /* end of cumsum_f16_u8_vx*/ +static const char custom_letterbox_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int top;\n\ +_viv_uniform int left;\n\ +_viv_uniform float out_scale_r;\n\ +_viv_uniform float out_scale_g;\n\ +_viv_uniform float out_scale_b;\n\ +_viv_uniform float out_zp_r;\n\ +_viv_uniform float out_zp_g;\n\ +_viv_uniform float out_zp_b;\n\ +_viv_uniform float pad_v_r;\n\ +_viv_uniform float pad_v_g;\n\ +_viv_uniform float pad_v_b;\n\ +_viv_uniform float scale_w;\n\ +_viv_uniform float scale_h;\n\ +_viv_uniform int resize_max_w;\n\ +_viv_uniform int resize_max_h;\n\ +_viv_uniform int out_height;\n\ +_viv_uniform int r_order;\n\ +_viv_uniform int b_order;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +__kernel void custom_letterbox_U8toU8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int top_,\n\ + int bottom_,\n\ + int left_,\n\ + int right_,\n\ + float mean_r_,\n\ + float mean_g_,\n\ + float mean_b_,\n\ + float scale_r_,\n\ + float scale_g_,\n\ + float scale_b_,\n\ + int pad_r_,\n\ + int pad_g_,\n\ + int pad_b_,\n\ + int reverse_channel\n\ + )\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = coord_out;\n\ + uint4 dst = (uint4)(0,0,0,0);\n\ + vxc_uchar8 result;\n\ +\n\ + if (coord_out.x < left || coord_out.x >= resize_max_w ||\n\ + coord_out.y < top || coord_out.y >= resize_max_h)\n\ + {\n\ + dst.x = convert_uint(pad_v_r);\n\ + coord.y = coord_out.y + r_order;\n\ + 
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(pad_v_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(pad_v_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + return;\n\ + }\n\ +\n\ + float in_x = convert_float(coord_out.x - left) * scale_w;\n\ + float in_y = convert_float(coord_out.y - top) * scale_h;\n\ + float left_x_f = floor(in_x);\n\ + float top_y_f = floor(in_y);\n\ + float x_lerp = in_x - left_x_f;\n\ + float y_lerp = in_y - top_y_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);\n\ + vxc_uchar8 top_data, bottom_data;\n\ +\n\ + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4 = (float4)(0,0,0,0);\n\ + float4 right4 = (float4)(0,0,0,0);\n\ + float4 top4 = (float4)(0,0,0,0);\n\ + float4 bottom4 = (float4)(0,0,0,0);\n\ + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + float4 out = (bottom4 - top4) * y_lerp + top4;\n\ +\n\ + dst.x = convert_uint(out.x * out_scale_r + out_zp_r );\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(out.y * out_scale_g + out_zp_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(out.z * out_scale_b + out_zp_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_letterbox_U8toI8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int top_,\n\ + int bottom_,\n\ + int left_,\n\ + int right_,\n\ + float mean_r_,\n\ + float mean_g_,\n\ + float mean_b_,\n\ + float scale_r_,\n\ + float scale_g_,\n\ + float scale_b_,\n\ + int pad_r_,\n\ + int pad_g_,\n\ + int pad_b_,\n\ + int reverse_channel\n\ + )\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), 
get_global_id(1));\n\ + int2 coord = coord_out;\n\ + int4 dst = (int4)(0,0,0,0);\n\ + vxc_char8 result;\n\ +\n\ + if (coord_out.x < left || coord_out.x >= resize_max_w ||\n\ + coord_out.y < top || coord_out.y >= resize_max_h)\n\ + {\n\ + dst.x = convert_int(pad_v_r);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(pad_v_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(pad_v_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + return;\n\ + }\n\ +\n\ + float in_x = convert_float(coord_out.x - left) * scale_w;\n\ + float in_y = convert_float(coord_out.y - top) * scale_h;\n\ + float left_x_f = floor(in_x);\n\ + float top_y_f = floor(in_y);\n\ + float x_lerp = in_x - left_x_f;\n\ + float y_lerp = in_y - top_y_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);\n\ + vxc_char8 top_data, bottom_data;\n\ +\n\ + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4 = (float4)(0,0,0,0);\n\ + float4 right4 = (float4)(0,0,0,0);\n\ + float4 top4 = (float4)(0,0,0,0);\n\ + float4 bottom4 = (float4)(0,0,0,0);\n\ + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + float4 out = (bottom4 - top4) * y_lerp + top4;\n\ +\n\ + dst.x = convert_int(out.x * out_scale_r + out_zp_r);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(out.y * out_scale_g + out_zp_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(out.z * out_scale_b + out_zp_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_letterbox_U8toF16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int top_,\n\ + int 
bottom_,\n\ + int left_,\n\ + int right_,\n\ + float mean_r_,\n\ + float mean_g_,\n\ + float mean_b_,\n\ + float scale_r_,\n\ + float scale_g_,\n\ + float scale_b_,\n\ + int pad_r_,\n\ + int pad_g_,\n\ + int pad_b_,\n\ + int reverse_channel\n\ + )\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = coord_out;\n\ + half4 tmp;\n\ + vxc_half8 dst_temp;\n\ + vxc_ushort8 dst;\n\ +\n\ + if (coord_out.x < left || coord_out.x >= resize_max_w ||\n\ + coord_out.y < top || coord_out.y >= resize_max_h)\n\ + {\n\ +\n\ + float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);\n\ + _viv_asm(CONV, tmp, pad);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmp.x = tmp.y;\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmp.x = tmp.z;\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + return;\n\ + }\n\ +\n\ + float in_x = convert_float(coord_out.x - left) * scale_w;\n\ + float in_y = convert_float(coord_out.y - top) * scale_h;\n\ + float left_x_f = floor(in_x);\n\ + float top_y_f = floor(in_y);\n\ + float x_lerp = in_x - left_x_f;\n\ + float y_lerp = in_y - top_y_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);\n\ + vxc_uchar8 top_data, bottom_data;\n\ +\n\ + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4 = (float4)(0,0,0,0);\n\ + float4 right4 = (float4)(0,0,0,0);\n\ + float4 top4 = (float4)(0,0,0,0);\n\ + float4 bottom4 = (float4)(0,0,0,0);\n\ + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + float4 out = (bottom4 - top4) * y_lerp + top4;\n\ +\n\ + float4 out_temp = (float4)(0,0,0,0);\n\ + out_temp.x = out.x * out_scale_r + out_zp_r;\n\ + _viv_asm(CONV, tmp, out_temp);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + out_temp.x = out.y * out_scale_g + out_zp_g;\n\ + _viv_asm(CONV, tmp, out_temp);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), 
uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + out_temp.x = out.z * out_scale_b + out_zp_b;\n\ + _viv_asm(CONV, tmp, out_temp);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_letterbox_vx*/ + static const char custom_softmax_vx[] = "/*\n\ ============================================================================\n\ Name : Softmax2.vx\n\ @@ -9853,7 +10162,12 @@ static const char custom_softmax_vx[] = "/*\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ +_viv_uniform VXC_512Bits uniExtract8Bin_2x8;\n\ _viv_uniform int sf_size;\n\ +_viv_uniform float srcScale;\n\ +_viv_uniform float srcZP;\n\ +_viv_uniform float dstScale;\n\ +_viv_uniform float dstZP;\n\ #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ __kernel void Softmax2VXC\n\ (\n\ @@ -9862,35 +10176,37 @@ __kernel void Softmax2VXC\n\ int axis\n\ )\n\ {\n\ - int4 coord_in = (int4)(0,0,0,0);\n\ - float fMax = 0.0;\n\ + int4 coord_in = (int4)(0, get_global_id(0), 0, 0);\n\ + float fMax = 0;\n\ for (int i = 0; i < sf_size; i++)\n\ {\n\ - vxc_char8 val;\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, val_h, val, 16);\n\ float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ \n\ fMax = F_MAX(fMax, fval);\n\ }\n\ -\n\ float fProbSum = 0.0f;\n\ vxc_short8 dst;\n\ for (int i = 0; i < sf_size; i++)\n\ {\n\ - vxc_char8 val;\n\ -\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, val_h, val, 16);\n\ float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ -\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ float fOut = (float)exp(fval - fMax);\n\ fProbSum += fOut;\n\ half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ - _viv_asm(COPY,dst,hVal, 4);\n\ + _viv_asm(CONV, hVal, fOut);\n\ + _viv_asm(COPY, dst, hVal, 4);\n\ +\n\ VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -9899,19 +10215,71 @@ __kernel void Softmax2VXC\n\ vxc_short8 val;\n\ vxc_half8 val_h;\n\ coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ float fval;\n\ _viv_asm(COPY, val_h,val, 16);\n\ VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ -\n\ - float fOut =fval/fProbSum;\n\ + float fOut =fval / fProbSum;\n\ half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(CONV, hVal, fOut);\n\ 
_viv_asm(COPY,dst,hVal, 4);\n\ VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ -"; /* end of custom_softmax_vx*/ +\n\ +__kernel void Softmax2VXC_u8\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(0, get_global_id(0), 0, 0);\n\ + float fMax = -3.4e38f;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_uchar8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + fval = (fval - srcZP) * srcScale;\n\ + fMax = F_MAX(fMax, fval);\n\ + }\n\ +\n\ + float fProbSum = 0.0f;\n\ + vxc_uchar8 dst;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_uchar8 val;\n\ +\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + fval = (fval - srcZP) * srcScale;\n\ + float fOut = (float)exp(fval - fMax);\n\ + fProbSum += fOut;\n\ + }\n\ +\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_uchar8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + fval = (fval - srcZP) * srcScale;\n\ +\n\ + float fOut = exp(fval - fMax) / fProbSum;\n\ +\n\ + fOut = fOut * dstScale + dstZP;\n\ + short dst0;\n\ + _viv_asm(CONV, dst0, fOut);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of custom_softmax_vx*/ static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ @@ -18077,7 +18445,7 @@ _viv_uniform float sum_x2_tail1;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ @@ -18087,7 +18455,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ - vxc_short8 src0; \\\n\ + load_type src; \\\n\ src_type in_h; \\\n\ float4 sumsqr; \\\n\ float4 tmpSumSqr = (float4)(0); \\\n\ @@ -18104,9 +18472,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## { \\\n\ for(coord.y = 0; coord.y < height;) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord.y++; \\\n\ - _viv_asm(COPY, in_h, src0, 16); \\\n\ + _viv_asm(COPY, in_h, src, 16); \\\n\ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ tmpSumSqr += sumsqr; \\\n\ } \\\n\ @@ -18137,10 +18505,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, 
coord_out, data); \\\n\ } \\\n\ }\n\ -GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)\n\ -GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)\n\ \n\ -#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \\\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ @@ -18150,7 +18519,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \\\n\ \\\n\ int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ - vxc_short8 src0; \\\n\ + load_type src; \\\n\ src_type in_h; \\\n\ float4 sumsqr = (float4)(0); \\\n\ \\\n\ @@ -18159,8 +18528,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## \\\n\ if(gidx < width) \\\n\ { \\\n\ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src, 16); \\\n\ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \\\n\ sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \\\n\ @@ -18189,8 +18558,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, coord_out, data); \\\n\ } \\\n\ }\n\ -GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ -GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)\n\ \n\ #define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ @@ -18239,7 +18609,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ @@ -18291,10 +18661,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ float4 norm; \\\n\ norm = alpha * tmpData0 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal0, norm); \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal1, norm); \\\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ 
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ @@ -18344,7 +18714,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ float4 norm; \\\n\ norm = alpha * tmpData0 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal0, norm); \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ @@ -18357,6 +18727,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)\n\ \n\ #define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ @@ -18394,10 +18765,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ float4 norm; \\\n\ norm = alpha * tmpData0 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal0, norm); \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal1, norm); \\\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ @@ -18407,6 +18778,7 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)\n\ \n\ "; /* end of group_normalization_2_vx*/ @@ -48227,45 +48599,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;\n\ _viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bin_2x8;\n\ -_viv_uniform int inputZP0;\n\ -_viv_uniform int inputZP1;\n\ -_viv_uniform float input_scale0;\n\ -_viv_uniform float input_scale1;\n\ -_viv_uniform float outputZP;\n\ -#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ - __kernel void prelu_##name0##to##name1( \\\n\ +_viv_uniform int input0_zp;\n\ +_viv_uniform int input1_zp;\n\ +_viv_uniform float input0_scale;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float output_zp;\n\ +#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ {\\\n\ int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0);\\\n\ - vxc_float4 vecA, vecB, vecC, vecD;\\\n\ + float4 vecA, vecB, vecC, vecD;\\\n\ input_type0 srcA;\\\n\ copy_type0 src0;\\\n\ vxc_short8 srcB;\\\n\ vxc_half8 src1;\\\n\ - input_type0 input_ZP;\\\n\ + input_type0 zp;\\\n\ VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src0, srcA, 16); \\\n\ VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src1, srcB, 16); \\\n\ \\\n\ - _viv_asm(COPY, input_ZP, inputZP0, 4);\\\n\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ + _viv_asm(COPY, zp, input0_zp, 4);\\\n\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ uniDataSubZPtoFp32Part0_4x4); \\\n\ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ + VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ uniDataSubZPtoFp32Part1_4x4);\\\n\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\\\n\ VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\\\n\ \\\n\ - vecA = vecA * input_scale0;\\\n\ - vecB = vecB * input_scale0;\\\n\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ - vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ - vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + vecA = vecA * input0_scale;\\\n\ + vecB = vecB * input0_scale;\\\n\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ + float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + float4 minData1 = vecB < 0 ? 
vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + output_zp;\\\n\ + vecB = maxData1 + vecD * minData1 + output_zp;\\\n\ convert_type dst0, dst1;\\\n\ _viv_asm(CONV_RTE, dst0, vecA);\\\n\ _viv_asm(CONV_RTE, dst1, vecB);\\\n\ @@ -48276,49 +48648,49 @@ _viv_uniform float outputZP;\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ }\n\ // name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type\n\ -PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ -PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ \n\ -#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ - __kernel void prelu_##name0##to##name1##_2D( \\\n\ +#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name##_2D( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ {\\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ - vxc_float4 vecA, vecB, vecC, vecD;\\\n\ + float4 vecA, vecB, vecC, vecD;\\\n\ input_type0 srcA;\\\n\ copy_type0 src0;\\\n\ vxc_short8 srcB;\\\n\ vxc_half8 src1;\\\n\ - input_type0 input_ZP;\\\n\ + input_type0 zp;\\\n\ VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src0, srcA, 16); \\\n\ VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src1, srcB, 16); \\\n\ \\\n\ - _viv_asm(COPY, input_ZP, inputZP0, 4);\\\n\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + _viv_asm(COPY, zp, input0_zp, 4);\\\n\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecB, 
src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\\\n\ VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\\\n\ \\\n\ - vecA = vecA * input_scale0;\\\n\ - vecB = vecB * input_scale0;\\\n\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ - vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ - vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + vecA = vecA * input0_scale;\\\n\ + vecB = vecB * input0_scale;\\\n\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ + float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + output_zp;\\\n\ + vecB = maxData1 + vecD * minData1 + output_zp;\\\n\ convert_type dst0, dst1;\\\n\ _viv_asm(CONV_RTE, dst0, vecA);\\\n\ _viv_asm(CONV_RTE, dst1, vecB);\\\n\ @@ -48328,49 +48700,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ }\n\ -PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ -PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ \n\ -#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \\\n\ - __kernel void prelu_U8U8to##name##_2D( \\\n\ +#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name##_2D( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ {\\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ - vxc_float4 vecA, vecB, 
vecC, vecD;\\\n\ - vxc_uchar16 src0;\\\n\ - vxc_uchar16 src1;\\\n\ - vxc_uchar16 input_ZP0;\\\n\ - vxc_uchar16 input_ZP1;\\\n\ + float4 vecA, vecB, vecC, vecD;\\\n\ + src0_type src0;\\\n\ + src1_type src1;\\\n\ + short zp0;\\\n\ + short zp1;\\\n\ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ \\\n\ - _viv_asm(COPY, input_ZP0, inputZP0, 4);\\\n\ - VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ - VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ - _viv_asm(COPY, input_ZP1, inputZP1, 4);\\\n\ - VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ - VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + _viv_asm(COPY, zp0, input0_zp, 2);\\\n\ + VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + _viv_asm(COPY, zp1, input1_zp, 4);\\\n\ + VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ \\\n\ - vecA = vecA * input_scale0;\\\n\ - vecB = vecB * input_scale0;\\\n\ - vecC = vecC * input_scale1;\\\n\ - vecD = vecD * input_scale1;\\\n\ - vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \\\n\ - vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \\\n\ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ - vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ - vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + vecA = vecA * input0_scale;\\\n\ + vecB = vecB * input0_scale;\\\n\ + vecC = vecC * input1_scale;\\\n\ + vecD = vecD * input1_scale;\\\n\ + float4 maxData0 = vecA >= 0 ? vecA : 0.0; \\\n\ + float4 maxData1 = vecB >= 0 ? vecB : 0.0; \\\n\ + float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + float4 minData1 = vecB < 0 ? 
vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + output_zp;\\\n\ + vecB = maxData1 + vecD * minData1 + output_zp;\\\n\ convert_type dst0, dst1;\\\n\ _viv_asm(CONV_RTE, dst0, vecA);\\\n\ _viv_asm(CONV_RTE, dst1, vecB);\\\n\ @@ -48380,8 +48752,9 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ }\n\ -PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)\n\ -PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)\n\ +PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +\n\ \n\ \n\ "; /* end of prelu_vx*/ @@ -54918,6 +55291,462 @@ __kernel void resize_bilinear_U8toU8_SAME_8x_upsample_half_pixel_centers\n\ }\n\ "; /* end of resize_bilinear_U8_half_pixel_centers_2_vx*/ +static const char resize_bilinear_U8_half_pixel_centers_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8PostProcess_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;\n\ +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;\n\ +_viv_uniform int out_height;\n\ +\n\ +__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 1) >> 2;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, tmp, result;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 dst0;\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + 
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;\n\ +__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 3) >> 3;\n\ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, dst0, dst1;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 tmp;\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_3_vx*/ + +static const char resize_bilinear_U8_half_pixel_centers_4_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8PostProcess_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;\n\ +__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 data;\n\ +\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_4_vx*/ + +static const char resize_bilinear_U8_half_pixel_centers_5_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8PostProcess_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;\n\ +__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 7) >> 4;\n\ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 tmp;\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst3, tmp, 
multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_5_vx*/ + static const char resize_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #if (VX_VERSION==2)\n\ @@ -56088,6 +56917,1186 @@ __kernel void resize_nearest_I16toI16_op\n\ }\n\ "; /* end of resize_nearest_vx*/ +static const char rope_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int half_head_size;\n\ +_viv_uniform VXC_512Bits 
uniATimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniATimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bnhs \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord_in; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + dst_type dst; \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.y += half_head_size; \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.y += half_head_size; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, 
vxc_short8, vxc_short8)\n\ +ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bnhs\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_out = coord_in;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ + coord_in.y += half_head_size;\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 - data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 * scale1 + data4 * scale0 + output_zp;\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + coord_out.y += half_head_size;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, 
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bnhs \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord_in; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.y += half_head_size; \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \\\n\ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.y += half_head_size; \\\n\ + 
VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bnhs\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_out = coord_in;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + vxc_ushort8 v0, v1, v2, v3, dst;\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += half_head_size;\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 data0 = src0 * cos0 - src2 * sin0;\n\ + float4 data1 = src1 * cos1 - src3 * sin1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, 
output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + data0 = src0 * sin0 + src2 * cos0;\n\ + data1 = src1 * sin1 + src3 * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + coord_out.y += half_head_size;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of rope_0_vx*/ + +static const char rope_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int half_head_size;\n\ +_viv_uniform VXC_512Bits uniATimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniATimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bnh1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord.x += half_head_size; \\\n\ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + 
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bnh1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ + coord.x += half_head_size;\n\ + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 - data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 + data4;\n\ + data3 = data3 + data5;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bnh1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + 
__read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord.x += half_head_size; \\\n\ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \\\n\ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bnh1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 
v0, v1, v2, v3, dst;\n\ + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += half_head_size;\n\ + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 data0 = src0 * cos0 - src2 * sin0;\n\ + float4 data1 = src1 * cos1 - src3 * sin1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + data0 = src0 * sin0 + src2 * cos0;\n\ + data1 = src1 * sin1 + src3 * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of rope_1_vx*/ + +static const char rope_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bsnh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + dst_type dst; \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
_viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bsnh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 
get_global_id(2));\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 + data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 + data5;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;\n\ +#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bsnh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ 
+ \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits 
uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bsnh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + vxc_ushort8 v0, v1, v2, v3, dst;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 even = (float4)(src0.xz, src1.xz);\n\ + float4 odd = (float4)(src0.yw, src1.yw);\n\ + float4 data0 = even * cos0 - odd * sin0;\n\ + float4 data1 = even * sin0 + odd * cos0;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + even = (float4)(src2.xz, src3.xz);\n\ + odd = (float4)(src2.yw, src3.yw);\n\ + data0 = even * cos1 - odd * sin1;\n\ + data1 = even * sin1 + odd * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of 
rope_2_vx*/ + +static const char rope_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bnsh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + dst_type dst; \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = 
convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bnsh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 + data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = 
data3 + data5;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;\n\ +#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bnsh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bnsh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + vxc_ushort8 v0, v1, v2, v3, dst;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 even = (float4)(src0.xz, src1.xz);\n\ + float4 odd = (float4)(src0.yw, src1.yw);\n\ + float4 data0 = even * cos0 - odd * sin0;\n\ + float4 data1 = even * sin0 + odd * cos0;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + even = (float4)(src2.xz, src3.xz);\n\ + odd = (float4)(src2.yw, src3.yw);\n\ + data0 = even * cos1 - odd * sin1;\n\ + data1 = even * sin1 + odd * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of rope_3_vx*/ + static const char scatter_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccumulateSum_2x8;\n\ @@ -57985,6 +59994,104 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \\\n\ }\n\ SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)\n\ SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)\n\ +SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)\n\ +SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \\\n\ +__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t output0 \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img0 = create_image_from_image2d(input_ref, 2); \\\n\ + Image img1 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \\\n\ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \\\n\ + data_type src, dst; \\\n\ + src = in_ptr[gidx]; \\\n\ + vxc_ushort8 mp0; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + out_ptr[gidx] = dst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)\n\ +SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)\n\ +\n\ +#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \\\n\ +__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \\\n\ + __read_only image2d_t input_index, \\\n\ + __read_only image2d_t input_update, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t input0, \\\n\ + image2d_t output1, \\\n\ + int width, int area, int vol, int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input_index, 4); \\\n\ + Image img2 = create_image_from_image2d(input_update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global data_type* update_ptr = (__global data_type*)img2.ptr; \\\n\ + __global data_type* output_ptr = (__global data_type*)img3.ptr; \\\n\ + data_type dst; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \\\n\ + data_type src = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * 
offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift1_Lo_2x8); \\\n\ + output_ptr[loc] = dst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)\n\ +SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)\n\ +\n\ +__kernel void scatter_nd_update_ref2out_F16toF16(\n\ + __read_only image2d_t input_ref,\n\ + image2d_t temp_ref,\n\ + image2d_t output0\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img0 = create_image_from_image2d(input_ref, 2);\n\ + Image img1 = create_image_from_image2d(temp_ref, 2);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;\n\ + out_ptr[gidx] = in_ptr[gidx];\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update2ref_F16toF16_16x(\n\ + __read_only image2d_t input_index,\n\ + __read_only image2d_t input_update,\n\ + image2d_t temp_ref,\n\ + image2d_t input0,\n\ + image2d_t output1,\n\ + int width, int area, int vol, int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + Image img1 = create_image_from_image2d(input_index, 4);\n\ + Image img2 = create_image_from_image2d(input_update, 2);\n\ + Image img3 = create_image_from_image2d(temp_ref, 2);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;\n\ + __global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + int loc = idx * output_width + gidx;\n\ + output_ptr[loc] = update_ptr[gidy * update_width + gidx];\n\ +}\n\ "; /* end of scatter_nd_update_special_vx*/ static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -63813,8 +65920,8 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis2( \\\n\ +#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, \\\n\ @@ -63833,19 +65940,19 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ int4 coord_out = coord; \\\n\ \\\n\ src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ \\\n\ float cnt = 0.0f; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ coord_out.z = channel - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.z--; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -63853,17 +65960,17 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = 
(uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(exclusive) \\\n\ { \\\n\ coord_out.z = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.z++; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -63871,45 +65978,44 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ else \\\n\ { \\\n\ for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)\n\ -\n\ -\n\ +CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ \n\ __kernel void cumsum_F32toF32_axis1(\n\ __read_only image2d_array_t input,\n\ @@ -63979,10 +66085,10 @@ __kernel void cumsum_F32toF32_axis1(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis1( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ +#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ int axis, \\\n\ int exclusive, \\\n\ int rev, \\\n\ @@ -63999,20 +66105,20 @@ __kernel void cumsum_##name##toU8_axis1( \\\n\ int4 coord_out = coord; \\\n\ \\\n\ src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ \\\n\ float cnt = 0; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ coord_out.y = height - 1; \\\n\ - write_imageui(output, 
coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ \\\n\ for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ coord_out.y--; \\\n\ sum += data; \\\n\ @@ -64020,17 +66126,17 @@ __kernel void cumsum_##name##toU8_axis1( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(exclusive) \\\n\ { \\\n\ coord_out.y = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ coord_out.y++; \\\n\ sum += data; \\\n\ @@ -64038,44 +66144,44 @@ __kernel void cumsum_##name##toU8_axis1( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ else \\\n\ { \\\n\ for(coord.y = 0; coord.y < height; coord.y++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)\n\ -\n\ +CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ \n\ __kernel void cumsum_F32toF32_axis0(\n\ __read_only image2d_array_t input,\n\ @@ -64145,8 +66251,8 @@ __kernel void cumsum_F32toF32_axis0(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis0( \\\n\ +#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, \\\n\ @@ -64165,19 +66271,19 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ int4 coord_out = coord; \\\n\ 
\\\n\ src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ \\\n\ float cnt = 0; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ coord_out.x = width - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.x = width - 1; coord.x > 0; coord.x--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.x--; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -64185,8 +66291,8 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(exclusive) \\\n\ @@ -64195,7 +66301,7 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ write_imageui(output, coord_out, dst); \\\n\ for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.x++; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -64203,43 +66309,45 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ else \\\n\ { \\\n\ for(coord.x = 0; coord.x < width; coord.x++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)\n\ +CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ +\n\ "; /* end of cumsum_cl*/ static const char cumsum_2d_cl[] = "\n\ @@ -64309,188 +66417,100 @@ __kernel void cumsum_F32toF32_axis1_2D(\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis1_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int 
exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w--;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ -}\n\ -\n\ -__kernel void cumsum_F32toU8_axis1_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w--;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * 
in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ +#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int chn, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord.w = height - 1; \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + coord.w--; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + coord.w++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ 
+ image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ }\n\ +CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ \n\ __kernel void cumsum_F32toF32_axis0_2D(\n\ __read_only image2d_t input,\n\ @@ -64560,191 +66580,103 @@ __kernel void cumsum_F32toF32_axis0_2D(\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0.0f;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ -}\n\ -\n\ -__kernel void cumsum_F32toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0.0f;\n\ - if(exclusive && rev)\n\ - {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - 
write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ +#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int chn, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ + \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ + \\\n\ + float cnt = 0.0f; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord.x = width - 1; \\\n\ + coord.z = coord.x; \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(; coord.x > 0; coord.x--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + coord.z--; \\\n\ + cnt += 1.0; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord.z = 0; \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + coord.z++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.x = width - 1; 
coord.x >= 0; coord.x--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.x = 0; coord.x < width; coord.x++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ }\n\ +CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ "; /* end of cumsum_2d_cl*/ static const char cumsum_array_2d_axis0_cl[] = "\n\ @@ -78995,7 +80927,33 @@ __kernel void one_hot_U8toU8\n\ coord.z ++;\n\ } while (coord.z < depth);\n\ }\n\ -"; /* end of one_hot_cl*/ +\n\ +__kernel void one_hot_I32toBF16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + uint on_value,\n\ + uint off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 src = read_imagei(input, coord.xy);\n\ +\n\ + int val = convert_int(convert_float(src.x) * inputScale - inputTail);\n\ + do\n\ + {\n\ + uint4 dst;\n\ + dst.x = val == coord.z ? 
on_value : off_value;\n\ +\n\ + write_imageui(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}"; /* end of one_hot_cl*/ static const char poolwithargmax_cl[] = "\n\ #define POOLWITHARGMAX_PROCESS(data_type, read_fun, write_fun0, write_fun1) \\\n\ @@ -82788,6 +84746,381 @@ __kernel void roi_align_U8_U16toU8\n\ }\n\ }"; /* end of roi_align_cl*/ +static const char rope_0_cl[] = "__kernel void rope_F32_F32toF32_axis0\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEF_2DARRAY(cos, cos_cache, coord);\n\ + READ_IMAGEF_2DARRAY(sin, sin_cache, coord);\n\ + coord.x = coord.x * step;\n\ + float4 src0 = read_imagef(input, coord);\n\ + int4 coord_out = coord;\n\ +\n\ + coord.x += half_head_size;\n\ + float4 src1 = read_imagef(input, coord);\n\ +\n\ + float4 dst0 = src0 * cos - src1 * sin;\n\ + float4 dst1 = src0 * sin + src1 * cos;\n\ +\n\ + write_imagef(output, coord_out, dst0);\n\ + coord_out.x += half_head_size;\n\ + write_imagef(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_F32_F32toF32_axis1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEF_2DARRAY(cos, cos_cache, coord);\n\ + READ_IMAGEF_2DARRAY(sin, sin_cache, coord);\n\ + coord.y = coord.y * step;\n\ + float4 src0 = read_imagef(input, coord);\n\ + int4 coord_out = coord;\n\ + coord.y += half_head_size;\n\ + float4 src1 = read_imagef(input, coord);\n\ +\n\ + float4 dst0 = src0 * cos - src1 * sin;\n\ + float4 dst1 = src0 * sin + src1 * cos;\n\ +\n\ + write_imagef(output, coord_out, dst0);\n\ + coord_out.y += half_head_size;\n\ + write_imagef(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_F32_F32toF32_axis2\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + float4 cos = read_imagef(cos_cache, coord);\n\ + float4 sin = read_imagef(sin_cache, coord);\n\ + coord.z = coord.z * step;\n\ + float4 src0 = read_imagef(input, coord);\n\ + int4 coord_out = coord;\n\ + coord.z += half_head_size;\n\ + float4 src1 = read_imagef(input, coord);\n\ +\n\ + float4 dst0 = src0 * cos - src1 * sin;\n\ + float4 dst1 = src0 * sin + src1 * cos;\n\ +\n\ + write_imagef(output, coord_out, dst0);\n\ + coord_out.z += half_head_size;\n\ + write_imagef(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void 
rope_I32_I32toI32_axis0\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.x = coord.x * step;\n\ + float4 src0 = convert_float4(read_imagei(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.x += half_head_size;\n\ + float4 src1 = convert_float4(read_imagei(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + int4 dst0 = convert_int4_rte(_dst0);\n\ + int4 dst1 = convert_int4_rte(_dst1);\n\ +\n\ + write_imagei(output, coord_out, dst0);\n\ + coord_out.x += half_head_size;\n\ + write_imagei(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_I32_I32toI32_axis1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.y = coord.y * step;\n\ + float4 src0 = convert_float4(read_imagei(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.y += half_head_size;\n\ + float4 src1 = convert_float4(read_imagei(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + int4 dst0 = convert_int4_rte(_dst0);\n\ + int4 dst1 = convert_int4_rte(_dst1);\n\ +\n\ + write_imagei(output, coord_out, dst0);\n\ + coord_out.y += half_head_size;\n\ + write_imagei(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_I32_I32toI32_axis2\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + float4 cos = convert_float4(read_imagei(cos_cache, coord));\n\ + float4 sin = convert_float4(read_imagei(sin_cache, coord));\n\ + coord.z = coord.z * step;\n\ + float4 src0 = convert_float4(read_imagei(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + 
coord.z += half_head_size;\n\ + float4 src1 = convert_float4(read_imagei(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = cos - cos_zp;\n\ + sin = sin - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + int4 dst0 = convert_int4_rte(_dst0);\n\ + int4 dst1 = convert_int4_rte(_dst1);\n\ +\n\ + write_imagei(output, coord_out, dst0);\n\ + coord_out.z += half_head_size;\n\ + write_imagei(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_U32_U32toU32_axis0\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + uint4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.x = coord.x * step;\n\ + float4 src0 = convert_float4(read_imageui(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.x += half_head_size;\n\ + float4 src1 = convert_float4(read_imageui(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + uint4 dst0 = convert_uint4_rte(_dst0);\n\ + uint4 dst1 = convert_uint4_rte(_dst1);\n\ +\n\ + write_imageui(output, coord_out, dst0);\n\ + coord_out.x += half_head_size;\n\ + write_imageui(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_U32_U32toU32_axis1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + uint4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.y = coord.y * step;\n\ + float4 src0 = convert_float4(read_imageui(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.y += half_head_size;\n\ + float4 src1 = convert_float4(read_imageui(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + uint4 dst0 = convert_uint4_rte(_dst0);\n\ + uint4 dst1 = convert_uint4_rte(_dst1);\n\ +\n\ + write_imageui(output, coord_out, dst0);\n\ + coord_out.y += half_head_size;\n\ + write_imageui(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_U32_U32toU32_axis2\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only 
image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + float4 cos = convert_float4(read_imageui(cos_cache, coord));\n\ + float4 sin = convert_float4(read_imageui(sin_cache, coord));\n\ + coord.z = coord.z * step;\n\ + float4 src0 = convert_float4(read_imageui(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.z += half_head_size;\n\ + float4 src1 = convert_float4(read_imageui(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = cos - cos_zp;\n\ + sin = sin - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + uint4 dst0 = convert_uint4_rte(_dst0);\n\ + uint4 dst1 = convert_uint4_rte(_dst1);\n\ +\n\ + write_imageui(output, coord_out, dst0);\n\ + coord_out.z += half_head_size;\n\ + write_imageui(output, coord_out, dst1);\n\ +}\n\ +"; /* end of rope_0_cl*/ + static const char scatter_elements_cl[] = "\n\ #define SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(name, dtype) \\\n\ __kernel void scatter_elements_axis0_##name \\\n\ @@ -86589,6 +88922,7 @@ static const source_map_t evis_resource[] = {"cumsum_ex_rev_axis1_vx", cumsum_ex_rev_axis1_vx}, {"cumsum_ex_rev_axis2_vx", cumsum_ex_rev_axis2_vx}, {"cumsum_f16_u8_vx", cumsum_f16_u8_vx}, + {"custom_letterbox_vx", custom_letterbox_vx}, {"custom_softmax_vx", custom_softmax_vx}, {"custom_warp_affine_vx", custom_warp_affine_vx}, {"custom_warp_affine_2d_vx", custom_warp_affine_2d_vx}, @@ -86812,12 +89146,19 @@ static const source_map_t evis_resource[] = {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, + {"resize_bilinear_U8_half_pixel_centers_3_vx", resize_bilinear_U8_half_pixel_centers_3_vx}, + {"resize_bilinear_U8_half_pixel_centers_4_vx", resize_bilinear_U8_half_pixel_centers_4_vx}, + {"resize_bilinear_U8_half_pixel_centers_5_vx", resize_bilinear_U8_half_pixel_centers_5_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, {"resize_bilinear_align_corners_vx", resize_bilinear_align_corners_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_bilinear_nhwc_bound_vx", resize_bilinear_nhwc_bound_vx}, {"resize_cubic_vx", resize_cubic_vx}, {"resize_nearest_vx", resize_nearest_vx}, + {"rope_0_vx", rope_0_vx}, + {"rope_1_vx", rope_1_vx}, + {"rope_2_vx", rope_2_vx}, + {"rope_3_vx", rope_3_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, {"scatter_nd_update_vx", scatter_nd_update_vx}, @@ -86987,6 +89328,7 @@ static const source_map_t cl_resource[] = {"resize_nearest_cl", resize_nearest_cl}, {"reversesequence_cl", reversesequence_cl}, {"roi_align_cl", roi_align_cl}, + {"rope_0_cl", rope_0_cl}, {"scatter_elements_cl", scatter_elements_cl}, {"scatter_elements_add_cl", scatter_elements_add_cl}, {"scatter_elements_mul_cl", scatter_elements_mul_cl}, diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 2c63c1e..5be282c 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ 
b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -29,6 +29,7 @@ #include "VX/vx_ext_program.h" #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_log.h" #include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" @@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel vx_size * program_len = NULL; const char **program_src = NULL; vx_context ctx = NULL; - vsi_nn_context_t context = NULL; vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; uint8_t i = 0; vsi_bool load_from_file = FALSE; + vsi_nn_runtime_option_t* options; + options = ((vsi_nn_graph_prv_t*)graph)->options; #define MAX_BUILDPROGRAM_LEN 128 char cmd[MAX_BUILDPROGRAM_LEN] = {0}; @@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN); status = VSI_FAILURE; ctx = vxGetContext( (vx_reference)graph->g ); - context = graph->ctx; - evis = context->config.evis.ver; + evis = options->config.evis.ver; program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *)); CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final ); @@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel { // set default evis version is 2 snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va); } else { snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va); } status = vxBuildProgram(program, cmd); @@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel vx_size program_len = 0; const uint8_t *program_ptr = NULL; vx_context ctx; - vsi_nn_context_t context; + vsi_nn_runtime_option_t* options; vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; #define MAX_BUILDPROGRAM_LEN 128 @@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel status = VSI_FAILURE; ctx = vxGetContext( (vx_reference)graph->g ); - context = graph->ctx; - evis = context->config.evis.ver; + options = ((vsi_nn_graph_prv_t*)graph)->options; + evis = options->config.evis.ver; program_ptr = vsi_nn_VxBinResourceGetResource( kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); @@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel { // set default evis version is 2 snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va); } else { snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va); } #else snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension"); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index d1ca746..be08a5e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_constraint_check.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" +#include 
"vsi_nn_tensor_util_prv.h" +#include "vsi_nn_error.h" static vsi_status _try_set_high_presision_tensor ( @@ -120,9 +122,22 @@ static vsi_status _static_batchnorm vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_status status; vsi_nn_kernel_param_t * param = NULL; vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; + uint32_t new_rank = 4; + vsi_nn_tensor_t* input0 = NULL; + vsi_nn_tensor_t* output = NULL; + char reshape0_tensor_name[_TENSOR_LEN]; + char reshape1_tensor_name[_TENSOR_LEN]; + char batch_norm_tensor_name[_TENSOR_LEN]; + + memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name)); + memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name)); + memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name)); + status = VSI_FAILURE; status = _try_set_high_presision_tensor(inputs); @@ -131,10 +146,43 @@ static vsi_status _static_batchnorm VSILOGE("Set tensor attr of high presision fail"); return status; } - if(_require_reshape(self, inputs)) + if (_require_reshape(self, inputs)) { - reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input; - reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output; + if (3 == inputs[0]->attr.dim_num) + { + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + } + else if (5 == inputs[0]->attr.dim_num) + { + shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + shape[1] = inputs[0]->attr.size[2]; + shape[2] = inputs[0]->attr.size[3]; + shape[3] = inputs[0]->attr.size[4]; + } + + input0 = vsi_nn_kernel_insert_reshape_node(self->graph, + inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD); + CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final); + reshape_tensors[0] = input0; + snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0); + if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshape 0 node output name fail", self->uid); + goto final; + } + output = vsi_nn_kernel_insert_reshape_node(self->graph, + outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD); + CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final); + reshape_tensors[5] = output; + snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1); + if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshap 1 node output name fail", self->uid); + goto final; + } } else { @@ -155,12 +203,26 @@ static vsi_status _static_batchnorm reshape_tensors, 5, &reshape_tensors[5], 1, param ); - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } - vsi_nn_kernel_param_release( ¶m ); + vsi_nn_kernel_param_release(¶m); + + if (output) + { + snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2); + if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u instance_norm node output name fail", self->uid); + goto final; + } + } + +final: + vsi_safe_release_tensor(input0); + vsi_safe_release_tensor(output); return status; } @@ -313,68 +375,6 @@ static vsi_status op_compute return status; } /* op_compute() */ -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - uint32_t dim = 0; 
- vsi_nn_batcnnorm_lcl_data *local = NULL; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; - char tensor_name[128]; - - dim = inputs[0]->attr.dim_num; - if(_require_reshape(self, inputs) == FALSE) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - /* - reshape 3d input (xcn) --> 4d input (whcn) - reshape 3d output(xcn) --> 4d output(whcn) - */ - dim = 4; - if (3 == inputs[0]->attr.dim_num) - { - shape[0] = inputs[0]->attr.size[0]; - shape[1] = 1; - shape[2] = inputs[0]->attr.size[1]; - shape[3] = inputs[0]->attr.size[2]; - } - else if (5 == inputs[0]->attr.dim_num) - { - shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; - shape[1] = inputs[0]->attr.size[2]; - shape[2] = inputs[0]->attr.size[3]; - shape[3] = inputs[0]->attr.size[4]; - } - local = self->nn_param.batch_norm.local; - if (VSI_NN_OPTIMIZE_BACKWARD == direction) - { - local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); - } - else - { - local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); - if(local->reshaped_output && local->reshaped_output->t) - { - memset(tensor_name, 0, sizeof(tensor_name)); - snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); - if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) - { - VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid); - return VSI_FAILURE; - } - } - } - - return VSI_SUCCESS; -} /* op_optimize() */ - static vsi_bool _dynamic_check ( vsi_nn_node_t * self, @@ -494,58 +494,6 @@ static vsi_bool op_check } } /* op_check() */ -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_nn_batcnnorm_lcl_data *local = NULL; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); - } - - if(_require_reshape(self, inputs)) - { - local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data)); - if(NULL == local) - { - return VSI_FAILURE; - } - memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data)); - self->nn_param.batch_norm.local = local; - } - return TRUE; -} /* op_setup() */ - -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm); - if(p->local) - { - if (p->local->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); - p->local->reshaped_input = NULL; - } - if (p->local->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); - p->local->reshaped_output = NULL; - } - vsi_nn_safe_free(p->local); - } - vsi_nn_op_common_deinit(self); - return VSI_SUCCESS; -} - #ifdef __cplusplus extern "C" { #endif @@ -555,10 +503,10 @@ DEF_OP_REG /* op_name */ BATCH_NORM, /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, /* input_num */ 5, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c index c47bd27..71513e1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c @@ -118,6 +118,7 @@ static vsi_bool op_setup if (outputs[0]->attr.dim_num == 0) { 
outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; vsi_nn_SetTensorIsScalar(outputs[0], TRUE); } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c index 43f8a8f..117a578 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c @@ -82,6 +82,7 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1) IO_TYPE(D_U32, D_U32) + IO_TYPE(D_I32, D_I32) IO_TYPE(D_F32, D_F32) IO_TYPE(D_F16, D_F16) IO_TYPE(D_BF16, D_BF16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index 11f0268..d9dbd88 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -253,6 +253,7 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_I32) IO_TYPE(D_BOOL8, D_U16) IO_TYPE(D_BOOL8, D_U32) + IO_TYPE(D_BOOL8, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_BOOL8) IO_TYPE(D_I8|Q_ASYM, D_BOOL8) IO_TYPE(D_I8|Q_DFP, D_BOOL8) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index a768b46..b6eb002 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; - out_rank = inputs[0]->attr.dim_num; + out_rank = vsi_nn_get_tensor_dims(inputs[0]); for ( i = 1; i < self->input.num; i++) { - in2_rank = inputs[i]->attr.dim_num; + in2_rank = vsi_nn_get_tensor_dims(inputs[i]); out_rank = vsi_nn_max( out_rank, in2_rank ); } @@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup { vsi_size_t sz0, sz1; - sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1; + sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1; for ( j = 1; j < self->input.num; j++) { - sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1; + sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? 
inputs[j]->attr.size[i] : 1; sz0 = vsi_nn_max( sz0, sz1 ); if (sz0 != sz1 && sz0 != 1 && sz1 != 1) { @@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup { outputs[0]->attr.dim_num = out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); - if (out_rank == 1 && - vsi_nn_GetTensorIsScalar(inputs[0]) && + if (vsi_nn_GetTensorIsScalar(inputs[0]) && vsi_nn_GetTensorIsScalar(inputs[1])) { vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; } } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index a887591..3a8edea 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -199,6 +199,7 @@ static vsi_bool op_setup if (o_rank == 0) { outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; vsi_nn_SetTensorIsScalar(outputs[0], TRUE); } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 31f7abc..878384a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -306,6 +306,8 @@ static vsi_bool _op_check IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM) + IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM) END_IO_TYPE_DECL(GROUP_NORM) if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c index da15699..bfa87f3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -197,6 +198,7 @@ static vsi_bool op_setup_default vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; vsi_nn_internal_tensor_t * h_times_r = NULL; vsi_nn_tensor_attr_t attr; + vsi_nn_activation_e recurrent_activation = p->recurrent_activation; vsi_nn_internal_init_node_wksp( self ); @@ -230,7 +232,8 @@ static vsi_bool op_setup_default memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - self->graph->ctx->config.support_stream_processor) + (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor && + recurrent_activation == VSI_NN_ACT_SIGMOID)) { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c index 5dbe4a4..e2ad82c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c @@ -93,37 +93,15 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1) IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, 
D_I8|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index d52eb7d..7382b1b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" @@ -351,7 +352,7 @@ static vsi_bool op_setup } else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) || - self->graph->ctx->config.support_stream_processor ) + ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ) { vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* reshape_tensor = NULL; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 46a389a..5c4502d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -106,7 +106,7 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); - if ( axis != 0 && !self->graph->ctx->config.support_stream_processor) + if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor) { vsi_nn_internal_tensor_t* mean_tensor = NULL; vsi_nn_internal_tensor_t* vari_tensor = NULL; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index 22dfd66..af2d283 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -139,7 +140,7 @@ static vsi_bool op_setup p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL; p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL; - if 
(self->graph->ctx->config.support_stream_processor) + if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor) { p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c index 2e0e48b..a7ec872 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c @@ -100,6 +100,7 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I16|Q_ASYM) IO_TYPE(D_I32, D_I16|Q_SYM) IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) @@ -111,8 +112,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_BF16) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_BF16) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) @@ -124,11 +127,14 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_BF16) IO_TYPE(D_I16|Q_ASYM, D_F32) IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_BF16) IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_BF16) IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(ONE_HOT) if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num)) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index 80acd79..60285f6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -36,6 +36,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -50,33 +51,52 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - param =vsi_nn_kernel_param_create(); + vsi_nn_tensor_t* reshape_tensor = NULL; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_pre_process_rgb_param* p = NULL; - vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x ); - vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y ); - vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left ); - vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top ); - vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean ); - vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean ); - vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean ); - vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale ); - vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale ); - vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale ); - vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel ); - vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm ); - 
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy ); - n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param ); - if( n != NULL ) + memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t)); + + shape[0] = shape[1] * shape[0]; + shape[1] = shape[2]; + shape[2] = 1; + + reshape_tensor = vsi_nn_reshape_tensor(self->graph, + inputs[0], shape, inputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final); + + p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", p->rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", p->rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean ); + vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param ); + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } +final: + vsi_safe_release_tensor(reshape_tensor); + return status; } /* op_compute() */ @@ -166,35 +186,57 @@ static vsi_bool op_setup } - self->nn_param.pre_process_rgb.local.enable_perm = FALSE; + p->local->enable_perm = FALSE; - if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE) + if (p->local->enable_perm == FALSE) { - p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]); - p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); } else { - p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); - p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]); + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]); } - p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); + p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); return TRUE; } /* op_setup() */ +static vsi_status op_init +( + vsi_nn_node_t* self +) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.pre_process_rgb.local = + (vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data)); + + if (NULL == self->nn_param.pre_process_rgb.local) + { + return VX_ERROR_NO_MEMORY; + } + + 
memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data)); + + return status; +} /* op_init() */ + static vsi_status op_deinit ( vsi_nn_node_t * self ) { - if (self->nn_param.pre_process_rgb.local.local_tensor != NULL) + if (self->nn_param.pre_process_rgb.local->local_tensor != NULL) { - vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor); - self->nn_param.pre_process_rgb.local.local_tensor = NULL; + vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor); + self->nn_param.pre_process_rgb.local->local_tensor = NULL; } + vsi_nn_safe_free(self->nn_param.pre_process_rgb.local); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -208,7 +250,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ PRE_PROCESS_RGB, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c index eacf99d..3f80fac 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c @@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute vsi_status status = VSI_FAILURE; vsi_nn_prelu_param *prelu = &self->nn_param.prelu; vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_nn_tensor_t* input0 = NULL; + vsi_nn_tensor_t* input1 = NULL; + vsi_nn_tensor_t* output = NULL; vsi_bool one_rank = FALSE; vsi_bool is_per_channel_alpha = 0; vsi_size_t alpha_shape = 1; @@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute uint32_t dims = outputs[0]->attr.dim_num; reshape_tensors[0] = inputs[0]; + reshape_tensors[2] = outputs[0]; one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape); for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) @@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute dims = inputs[1]->attr.dim_num; } - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + input1 = vsi_nn_reshape_tensor( self->graph, inputs[1], (vsi_size_t*)shapes, dims ); + CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final); + reshape_tensors[1] = input1; } else { memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + input1 = vsi_nn_reshape_tensor( self->graph, inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final); + reshape_tensors[1] = input1; } } else { + uint32_t rank = inputs[0]->attr.dim_num; dims = inputs[1]->attr.dim_num; memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); @@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute shapes[1] = 1; dims = 2; } + else if (one_rank && inputs[1]->attr.is_const == TRUE && + alpha_shape == inputs[0]->attr.size[0] && + alpha_shape == inputs[1]->attr.size[0] && + rank < 3) + { + is_per_channel_alpha = TRUE; + shapes[0] = 1; + shapes[1] = 1; + shapes[2] = alpha_shape; + shapes[3] = rank > 1 ? 
inputs[0]->attr.size[1] : 1; + dims = 4; + input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims); + CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final); + reshape_tensors[0] = input0; + output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims); + CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final); + reshape_tensors[2] = output; + shapes[0] = alpha_shape; + shapes[1] = 1; + dims = 2; + } - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + input1 = vsi_nn_reshape_tensor( self->graph, inputs[1], (vsi_size_t*)shapes, dims ); + CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final); + reshape_tensors[1] = input1; } // Add params @@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name, &reshape_tensors[0], 2, - outputs, 1, param ); + &reshape_tensors[2], 1, param ); vsi_nn_kernel_param_release( ¶m ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } +final: + vsi_safe_release_tensor(input0); + vsi_safe_release_tensor(input1); + vsi_safe_release_tensor(output); + return status; } /* _prelu_op_compute() */ @@ -211,28 +247,36 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(PRELU, 2, 1) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_BF16, D_F16, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BF16, D_F16, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) /* HW 9.0 */ - IO_TYPE(D_F32, D_BF16, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F32, D_BF16, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) END_IO_TYPE_DECL(PRELU) - if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 4c314b8..84dc0b6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - if ( !self->graph->ctx->config.support_stream_processor || + if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor || (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) ) { return FALSE; @@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis( } *out_rank_x = inputs[0]->attr.dim_num; } - else if (!self->graph->ctx->config.support_stream_processor || + else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor || resolved_dim_count > 2) { optimzation_input_size( diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 662fa96..ce249e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -61,7 +61,7 @@ static vsi_status op_compute vx_nn_reshape_params_t reshape_param; memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.reshape.dim_num; + attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1); attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -124,17 +124,28 @@ static vsi_bool op_setup vsi_bool ret = TRUE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t i = 0; - for (i = 0; i < self->nn_param.reshape.dim_num; i++) + if (self->nn_param.reshape.dim_num == 0 || + self->nn_param.reshape.size == NULL + ) { - shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \ - (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i]; + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } + else + { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t i = 0; + for (i = 0; i < self->nn_param.reshape.dim_num; i++) + { + shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? 
\ + (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i]; + } + ret = vsi_nn_CalcReshapeTensor(inputs[0], + outputs[0], + shape, + self->nn_param.reshape.dim_num); } - ret = vsi_nn_CalcReshapeTensor(inputs[0], - outputs[0], - shape, - self->nn_param.reshape.dim_num); } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 93d269d..dff517f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -66,7 +66,7 @@ static vsi_status op_compute } memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.reshape2.dim_num; + attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1); attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -161,13 +161,24 @@ static vsi_bool op_setup vsi_bool ret = TRUE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - memcpy(shape, self->nn_param.reshape2.size, - sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num); - ret = vsi_nn_CalcReshapeTensor(inputs[0], - outputs[0], - shape, - self->nn_param.reshape2.dim_num); + if (self->nn_param.reshape2.dim_num == 0 || + self->nn_param.reshape2.size == NULL + ) + { + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } + else + { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + memcpy(shape, self->nn_param.reshape2.size, + sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num); + ret = vsi_nn_CalcReshapeTensor(inputs[0], + outputs[0], + shape, + self->nn_param.reshape2.dim_num); + } } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rope.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rope.c new file mode 100644 index 0000000..45c0307 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rope.c @@ -0,0 +1,145 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_error.h" + +typedef struct _rope_local_data_t { + int32_t placeholder; +} rope_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + int32_t axis = self->nn_param.rope.axis; + vsi_bool interleaved = self->nn_param.rope.interleaved; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "axis", axis); + vsi_nn_kernel_param_add_int32(param, "interleaved", interleaved); + self->n = (vx_node)vsi_nn_kernel_selector(self->graph, "rope", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + if (param != NULL) + { + vsi_nn_kernel_param_release(&param); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(ROPE, _INPUT_NUM, _OUTPUT_NUM) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_F16, D_F16, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_U8|Q_ASYM) + END_IO_TYPE_DECL(ROPE) + if (!VALIDATE_OP_IO_TYPES(ROPE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ROPE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + 
); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 0d85eb1..70b22e7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" @@ -188,7 +189,7 @@ static vsi_status op_optimize } if ( _need_split_softmax(self, inputs) == FALSE || self->nn_param.softmax_internal.axis != 0 || - self->graph->ctx->config.support_stream_processor ) + ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index 7e8ae34..652e6c4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -39,6 +39,10 @@ #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "vsi_nn_error.h" +typedef struct _topk_local_data_t { + vsi_bool use_internal_node; +} topk_local_data_t; + #define _INPUT_NUM (1) #define _OUTPUT_NUM (2) @@ -111,19 +115,43 @@ static vsi_status op_compute vsi_nn_tensor_t * out1_tensor = NULL; vsi_bool ret = FALSE; - if (inputs[0]->attr.size[axis] == 1) + if (self->nn_param.topk.local->use_internal_node) { return vsi_nn_internal_compute_node( self ); } - ret = vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rank_in, &new_axis0); + if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH) + { + int32_t i = 1; - ret = vsi_nn_kernel_optimize_softmax_shape( - outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, - shapes[1], &rank_out, &new_axis1); + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[0][1] = 1; + shapes[1][1] = 1; + for (i = 1; i < (int32_t)(inputs[0]->attr.dim_num); i++) + { + shapes[0][1] = shapes[0][1] * inputs[0]->attr.size[i]; + } + for (i = 1; i < (int32_t)(outputs[0]->attr.dim_num); i++) + { + shapes[1][1] = shapes[1][1] * outputs[0]->attr.size[i]; + } + new_axis0 = axis; + new_axis1 = axis; + rank_in = 2; + rank_out = 2; + ret = TRUE; + } + else + { + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis0); + ret = vsi_nn_kernel_optimize_softmax_shape( + outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, + shapes[1], &rank_out, &new_axis1); + } if (ret) { uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0}; @@ -303,10 +331,12 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* const0_input = NULL; vsi_nn_tensor_attr_t attr; + p->local->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp(self); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); - curr->inputs[0] = inputs[0]; + curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); @@ -318,10 +348,42 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO(const0_input, "Create tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); - curr->inputs[0] = const0_input->t; + curr->inputs[0] = const0_input->t; curr->outputs[0] = outputs[1]; vsi_nn_internal_setup_node(self, curr); } + else if 
(vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* temp_tensor = NULL; + vsi_nn_tensor_attr_t attr; + + p->local->use_internal_node = TRUE; + + vsi_nn_internal_init_node_wksp(self); + + memcpy(&attr, &inputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + temp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(temp_tensor, "Create tensor failed", final); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_TOPK, 1, 2); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->node->nn_param.topk.axis = p->axis; + curr->node->nn_param.topk.k = p->k; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = temp_tensor->t; + curr->outputs[1] = outputs[1]; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = temp_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } return TRUE; final: @@ -341,7 +403,7 @@ static vsi_status op_optimize VSI_UNREFERENCED(outputs); p = &(self->nn_param.topk); - if (inputs[0]->attr.size[p->axis] == 1) + if (p->local->use_internal_node) { return vsi_nn_internal_optimize_node( self, direction ); } @@ -357,6 +419,14 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; self->nn_param.topk.axis = 0; + self->nn_param.topk.local = \ + (topk_local_data_t*)malloc(sizeof(topk_local_data_t)); + if (NULL == self->nn_param.topk.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.topk.local, 0, sizeof(topk_local_data_t)); + return status; } /* op_init() */ @@ -365,7 +435,12 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vsi_nn_internal_deinit_node_wksp(self); + if (self->nn_param.topk.local->use_internal_node) + { + vsi_nn_internal_deinit_node_wksp(self); + } + + vsi_nn_safe_free(self->nn_param.topk.local); vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index feaa0fc..d411a6b 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -475,6 +475,7 @@ static _op_param_gen_t s_op_gen[] = /* GROUPED_CONV3D */ NULL, /* COL2IM */ NULL, /* L1_LAYER_NORM */ NULL, + /* ROPE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index ac4aa2a..e59bc81 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -98,7 +98,7 @@ static VSI_INLINE_API void _convert_bfloat16_to_float uint32_t i; for( i = 0; i < size; i ++ ) { - out_buffer[i] = bfp16_to_fp32( (int16_t)buffer[i] ); + out_buffer[i] = bfp16_to_fp32( (uint16_t)buffer[i] ); } } /* _convert_bfloat16_to_float */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 3a40e10..969ca51 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -40,6 +40,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_graph.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_log.h" 
@@ -1261,7 +1262,9 @@ vsi_bool vsi_nn_is_same_quant_type( break; } #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT - case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: { + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC: + { const float diff = (float)1e-5; int32_t i = 0; int32_t scale_cnt0 = src_dtype->group_count; @@ -1627,12 +1630,12 @@ vsi_bool vsi_nn_is_stream_process_supported_types { size_t i = 0; - if ( graph->ctx->config.support_stream_processor == 0 ) + if ( ((vsi_nn_graph_prv_t*)graph)->options->config.support_stream_processor == 0 ) { return FALSE; } - if ( graph->ctx->config.sp_exec_count == 0 ) + if ( ((vsi_nn_graph_prv_t*)graph)->options->config.sp_exec_count == 0 ) { return FALSE; } @@ -1769,3 +1772,11 @@ typedef enum return support; } + +uint32_t vsi_nn_get_tensor_dims + ( + vsi_nn_tensor_t* tensor + ) +{ + return vsi_nn_GetTensorIsScalar(tensor) ? 0 : tensor->attr.dim_num; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 4fd9be7..e669a2c 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -39,6 +39,9 @@ static vsi_status query_hardware_caps #endif #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT vx_hardware_caps_params_ext_t paramExt; +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + vx_hardware_caps_params_ext3_t paramExt3; +#endif memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t)); status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt), @@ -73,6 +76,13 @@ static vsi_status query_hardware_caps } #endif +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3), + sizeof(vx_hardware_caps_params_ext3_t)); + context->config.support_ffd = paramExt3.supportFixedFunctionDevice; +#endif + #endif if(param.evis1 == TRUE && param.evis2 == FALSE) @@ -93,6 +103,85 @@ final: return status; } +vsi_status query_hardware_caps_runtime + ( + vsi_nn_context_t context, + vsi_nn_runtime_option_t* options + ) +{ + vsi_status status = VSI_FAILURE; + vx_hardware_caps_params_t param; + VSI_UNREFERENCED(options); + memset(&(options->config), 0, sizeof(vsi_nn_hw_config_t)); +#if VX_STREAM_PROCESSOR_SUPPORT + vx_hardware_caps_params_ext2_t paramExt2; +#endif +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + vx_hardware_caps_params_ext3_t paramExt3; +#endif +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + vx_hardware_caps_params_ext_t paramExt; + + memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt), + sizeof(vx_hardware_caps_params_ext_t)); + param.evis1 = paramExt.base.evis1; + param.evis2 = paramExt.base.evis2; +#else + memset(&param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(context->c, &param, sizeof(vx_hardware_caps_params_t)); +#endif + TEST_CHECK_STATUS(status, final); + +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + options->config.subGroupSize = paramExt.subGroupSize; +#ifdef VSI_40BIT_VA_SUPPORT + options->config.use_40bits_va = paramExt.supportVA40; +#endif +#if VX_STREAM_PROCESSOR_SUPPORT + memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2), + sizeof(vx_hardware_caps_params_ext2_t)); + if (options->enable_stream_processor) + { + options->config.support_stream_processor = 
paramExt.supportStreamProcessor; + options->config.sp_exec_count = paramExt2.streamProcessorExecCount; + options->config.sp_vector_depth = paramExt2.streamProcessorVectorSize; + if (options->config.sp_exec_count > 0) + { + options->config.sp_per_core_vector_depth = + options->config.sp_vector_depth / options->config.sp_exec_count; + } + } +#endif + +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3), + sizeof(vx_hardware_caps_params_ext3_t)); + options->config.support_ffd = paramExt3.supportFixedFunctionDevice; +#endif + +#endif + + if(param.evis1 == TRUE && param.evis2 == FALSE) + { + options->config.evis.ver = VSI_NN_HW_EVIS_1; + } + else if(param.evis1 == FALSE && param.evis2 == TRUE) + { + options->config.evis.ver = VSI_NN_HW_EVIS_2; + } + else + { + options->config.evis.ver = VSI_NN_HW_EVIS_NONE; + VSILOGW("Unsupported evis version"); + } + +final: + return status; +} + #if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30)) static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER"; static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK"; @@ -153,6 +242,44 @@ vsi_status vsi_nn_initOptions return VSI_SUCCESS; } +vsi_status vsi_nn_initOptions_runtime + ( + vsi_nn_runtime_option_t *options, + vsi_nn_context_t ctx + ) +{ + int32_t default_value = 1; + + options->enable_shader = vsi_nn_getenv_asint(ENV_ENABLE_SHADER, 1); + options->enable_opcheck = vsi_nn_getenv_asint(ENV_ENABLE_OPCHECK, 1); +#if (VX_CONCAT_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value); + options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); + options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1); + options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1); + options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0); +#if (VX_STRIDED_SLICE_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value); + options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0); + options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0); + options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1); + options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1); + + /*init hw params*/ + options->config = ctx->config; + + return VSI_SUCCESS; +} + + vsi_nn_context_t vsi_nn_CreateContext ( void ) { diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 85cad88..2ee8f0b 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -1362,7 +1362,7 @@ vsi_nn_graph_t * vsi_nn_CreateGraph graph->isAllowFastMode = TRUE; vsi_nn_MapInit( graph->node_table ); vsi_nn_MapInit( graph->tensor_table ); - vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options ); + vsi_nn_initOptions_runtime( ((vsi_nn_graph_prv_t*) graph)->options, ctx ); } else { @@ -3398,6 +3398,7 @@ char* vsi_nn_GetRunTimeVariable #define varSize 256 char* value_str = (char*)malloc(sizeof(char) * varSize); CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final); + CHECK_PTR_FAIL_GOTO(graph, "Graph is NULL!", 
final); memset(value_str, 0, varSize); char tmp_value[varSize] = {0}; VSI_UNREFERENCED(tmp_value); @@ -3502,6 +3503,8 @@ vsi_status vsi_nn_SetRunTimeVariable break; case VSI_VX_ENABLE_STREAM_PROCESSOR: options->enable_stream_processor = atoi(value); + options->config.support_stream_processor = atoi(value); + status = query_hardware_caps_runtime(graph->ctx, options); break; case VSI_VX_ENABLE_BATCH_OPT: options->enable_batch_opt = atoi(value); diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index c017ea5..9cbe72a 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -895,10 +895,13 @@ static void _convert_const_I8toU8 attr->dtype.vx_type = VSI_NN_TYPE_UINT8; attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr->dtype.zero_point += 128; - - if ( tensor->t ) vxReleaseTensor(&tensor->t); + if (tensor->t) vxReleaseTensor(&tensor->t); tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr); - +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + int32_t is_sparsity = 0; + is_sparsity = vsi_nn_GetTensorIsSparsity(tensor); + vsi_nn_SetTensorIsSparsity(tensor, is_sparsity); +#endif final: vsi_nn_safe_free( data ); }/* _convert_const_I8toU8() */ diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index c30d031..f9c66bb 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -247,7 +247,8 @@ static void _set_preproc_node_input_attr vsi_nn_tensor_attr_t* attr, vsi_nn_preprocess_image_size_t* input_size, vsi_nn_preprocess_source_format_e* source_format, - vsi_nn_preprocess_source_layout_e* source_layout + vsi_nn_preprocess_source_layout_e* source_layout, + vsi_nn_preprocess_dtype_convert_t* data_convert ) { *input_attr = *attr; @@ -266,26 +267,33 @@ static void _set_preproc_node_input_attr } if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR) { - input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32; + if(data_convert != NULL) + { + input_attr->dtype = data_convert->dtype; + } + else + { + input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } } else { input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; input_attr->dtype.vx_type = VSI_NN_TYPE_UINT8; } - if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) + if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) { - if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + if (*source_layout == VSI_NN_SOURCE_LAYOUT_NCHW) { - input_attr->size[0] = input_attr->size[1]*input_attr->size[0]; - input_attr->size[1] = input_attr->size[2]; - input_attr->size[2] = 1; - } - else - { - input_attr->size[0] = input_attr->size[2]*input_attr->size[0]; - input_attr->size[2] = 1; + vsi_size_t channel = input_attr->size[2]; + if (channel != 3) + { + VSILOGE("RGB chanel must be 3, please have a check!"); + } + input_attr->size[2] = input_attr->size[1]; + input_attr->size[1] = input_attr->size[0]; + input_attr->size[0] = channel; } } @@ -333,15 +341,10 @@ static void _set_preproc_node_input_attr static void _set_preproc_node_output_attr ( vsi_nn_tensor_attr_t* output_attr, - vsi_nn_tensor_attr_t* attr, - vsi_nn_preprocess_dtype_convert_t* data_convert + vsi_nn_tensor_attr_t* attr ) { *output_attr = *attr; - if(data_convert != NULL) - { - output_attr->dtype = data_convert->dtype; - } output_attr->dtype.fmt = 
VSI_NN_DIM_FMT_NCHW; output_attr->dim_num = VSI_NN_DIM_AUTO; output_attr->is_const = FALSE; @@ -603,10 +606,11 @@ vsi_status vsi_nn_add_single_preproc_node _set_preproc_node_out_attr(node, image_resize, &org_norm_tensor->attr, source_layout); /* Set input tensor attr */ - _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, source_format, source_layout); + _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, + source_format, source_layout, data_convert); /* Set output tensor attr */ - _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr, data_convert); + _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr); /* Create new norm and virtual tensors */ if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 44ab53e..419cf2d 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -33,6 +33,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_error.h" vsi_bool vsi_nn_rnn_find_best_kernel_size @@ -804,7 +805,7 @@ vsi_status vsi_nn_rnn_data_check_aligned vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size, input[i]->attr.dim_num, input[i]->attr.dtype.vx_type ); - if( ofst & 0x3f && !self->graph->ctx->config.support_stream_processor) + if( ofst & 0x3f && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor) { vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 179755f..6a40412 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -155,6 +155,15 @@ static void print_tensor tensor->attr.dtype.group_size); ext_attr[count] = 0; break; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC: + count = snprintf(&ext_attr[0], + _EXT_ATTR_BUF_SZ, + "ASYM GPTQ axis=%d, count=%d, group_size=%d", + tensor->attr.dtype.group_channel_dim, + tensor->attr.dtype.group_count, + tensor->attr.dtype.group_size); + ext_attr[count] = 0; + break; #endif default: vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); @@ -449,6 +458,11 @@ static vsi_bool _init_tensor scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count); CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float)); + zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); + memcpy(zeroPoints, + tensor->attr.dtype.zero_points, + tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim; params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size; params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count; @@ -460,6 +474,32 @@ static vsi_bool _init_tensor VSILOGE( "can't support qnt_type " "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC."); + break; +#endif + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC: +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + params.quant_format = 
(vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP; + // This is a hack that driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); + memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float)); + zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); + memcpy(zeroPoints, + tensor->attr.dtype.group_zero_points, + tensor->attr.dtype.group_count * sizeof(int32_t)); + params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim; + params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size; + params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count; + params.quant_data.affinePerGroup.scales = scales; + params.quant_data.affinePerGroup.zero_points = zeroPoints; + params.quant_data.affinePerGroup.zero_point_group_count = tensor->attr.dtype.group_count; + break; +#else + VSILOGE( + "can't support qnt_type " + "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC."); + break; #endif default: break; @@ -1788,6 +1828,57 @@ int8_t vsi_nn_GetTensorIsScalar return _get_tensor_is_scalar((vsi_nn_tensor_prv_t*)tensor); } +int32_t _get_tensor_is_sparsity +( + vsi_nn_tensor_prv_t* tensor +) +{ + int32_t is_sparsity = FALSE; + if (NULL == tensor) + { + VSILOGE("To get is_sparsity, tensor pointer SHOULD NOT be NULL."); + goto final; + } +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + is_sparsity = tensor->sparsity_type; +#endif +final: + return is_sparsity; +} + +int32_t vsi_nn_GetTensorIsSparsity +( + vsi_nn_tensor_t* tensor +) +{ + return _get_tensor_is_sparsity((vsi_nn_tensor_prv_t*)tensor); +} + +vsi_status vsi_nn_SetTensorIsSparsity +( + vsi_nn_tensor_t* tensor, + int32_t is_sparsity +) +{ + VSI_UNREFERENCED(is_sparsity); + vsi_status status = VSI_SUCCESS; + if (NULL == tensor) { + status = VSI_FAILURE; + goto final; + } +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + vxSetTensorAttribute(tensor->t, + VX_TENSOR_SPARSITY_TYPE, + &is_sparsity, + sizeof(vx_enum)); + status = VSI_SUCCESS; + ((vsi_nn_tensor_prv_t*)tensor)->sparsity_type = is_sparsity; +#endif +final: + return status; +} + + vsi_status vsi_nn_CopyRawDataToTensor ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h index c041c65..a7dcf56 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h @@ -75,6 +75,11 @@ vsi_status _set_tensor_is_scalar int8_t is_salar ); +vsi_status _set_tensor_is_sparsity( + vsi_nn_tensor_prv_t* tensor, + int32_t is_sparsity +); + int8_t _get_tensor_is_from_axisram ( vsi_nn_tensor_prv_t* tensor @@ -127,6 +132,11 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node vsi_nn_opt_direction_e direction ); +uint32_t vsi_nn_get_tensor_dims + ( + vsi_nn_tensor_t* tensor + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h index 4f9fd0b..5d89b0b 100644 --- a/src/tim/vx/internal/src/vsi_nn_types_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -108,6 +108,11 @@ typedef struct _vsi_nn_tensor_prv /** create tensor from axisram.*/ int8_t is_from_axisram; + /** 2:4 sparsity attr. 
*/ +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + vx_tensor_sparsity_param_e sparsity_type; /*!< \brief sparsity type for the tensor */ +#endif + // Add tensor internal attribute here... } vsi_nn_tensor_prv_t;
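Editorial note (not part of the patch): the hunks above register a new ROPE operation whose behavior is controlled only by nn_param.rope.axis and nn_param.rope.interleaved, with the kernel resolved through the "rope" selector. The sketch below illustrates how a graph built against this ovxlib revision might instantiate the op through the public node API; the meaning of the three inputs (data plus cos/sin tables), the attribute values, and the tensor-id names are assumptions for illustration only, not something this diff specifies.

#include "vsi_nn_pub.h"

/* Hypothetical usage sketch: attach a ROPE node to an existing vsi_nn_graph_t.
 * Assumes `graph`, the tensor ids `in[3]` (data, cos table, sin table) and `out`
 * were created elsewhere with shapes/dtypes accepted by op_check above. */
vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_ROPE, 3, 1, NULL);
if (node != NULL)
{
    node->nn_param.rope.axis = 0;            /* rotate along dimension 0 (assumed) */
    node->nn_param.rope.interleaved = FALSE; /* split-half rather than interleaved pairing (assumed) */
    node->input.tensors[0]  = in[0];
    node->input.tensors[1]  = in[1];
    node->input.tensors[2]  = in[2];
    node->output.tensors[0] = out;
}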