Update internal ovxlib to release/1.2.22 (#706)

* Update internal ovxlib to release/1.2.22
* Refine yaml file for blocking tfhub model tests

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

Parent: 149834832c
Commit: 8494275d76
@@ -124,7 +124,7 @@ jobs:
       run: |
         git config --global user.email "xiang.zhang@verisilicon.com"
         git config --global user.name "xiang.zhang"
-        git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0
+        git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1
         git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate
         cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF
         cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}}

@@ -283,61 +283,61 @@ jobs:
   #       chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
   #       ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite

-  tfhub-efficientdet-lite0:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite0:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite1:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite1:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite2:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite2:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite3:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite3:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

   # acuity-yolov3-608-quant:
   #   runs-on: ubuntu-latest
@@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)
+DEF_NODE_TYPE(custom_letterbox)

@@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)
+DEF_OP(CUSTOM_LETTERBOX)
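The two lists above are X-macro tables: registering the new op only takes one `DEF_NODE_TYPE`/`DEF_OP` entry, and every consumer that defines the macro before including the list picks it up automatically. A minimal standalone sketch of the pattern follows; it is our illustration of the idiom, not ovxlib's actual expansion sites.

#include <stdio.h>

/* X-macro op table, mirroring the DEF_OP(...) list above (illustrative). */
#define OP_LIST \
    DEF_OP(CUSTOM_SAMPLE) \
    DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS) \
    DEF_OP(CUSTOM_LETTERBOX)

/* Expansion 1: build an enum from the table. */
#define DEF_OP(name) OP_##name,
typedef enum { OP_LIST OP_COUNT } op_kind_e;
#undef DEF_OP

/* Expansion 2: build a parallel name table from the same list. */
#define DEF_OP(name) #name,
static const char *op_names[] = { OP_LIST };
#undef DEF_OP

int main(void)
{
    printf("%d ops, last: %s\n", (int)OP_COUNT, op_names[OP_CUSTOM_LETTERBOX]);
    return 0;
}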
@@ -0,0 +1,61 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H
+#define _VSI_NN_OP_CUSTOM_LETTERBOX_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_custom_letterbox_param
+{
+    struct _custom_letterbox_local_data_t* local;
+    int32_t new_shape_w;
+    int32_t new_shape_h;
+    vx_bool auto_bool;
+    vx_bool scaleFill;
+    vx_bool scaleup;
+    int32_t stride;
+    vx_bool center;
+    float mean_r;
+    float mean_g;
+    float mean_b;
+    float scale_r;
+    float scale_g;
+    float scale_b;
+    int32_t pad_value_r;
+    int32_t pad_value_g;
+    int32_t pad_value_b;
+    vx_bool reverse_channel;
+} vsi_nn_custom_letterbox_param;
+_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \
+    vsi_nn_custom_lertterbox_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
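Note how the param struct puts `local` first and pins that layout with `_compiler_assert(offsetof(...) == 0, ...)`, so generic code can treat the head of any op's param struct as the local-data pointer. A standalone sketch of the same compile-time check, using standard C11 `_Static_assert` in place of ovxlib's `_compiler_assert` macro (an assumption about that macro's role):

#include <stddef.h>
#include <stdint.h>

typedef struct letterbox_param
{
    struct local_data_t *local;   /* must stay the first member */
    int32_t new_shape_w;
    int32_t new_shape_h;
} letterbox_param;

/* Compile-time guarantee: fails to build if someone reorders the members,
 * which would break code that reads the local pointer at offset 0. */
_Static_assert(offsetof(letterbox_param, local) == 0,
               "local must be the first field of letterbox_param");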
@@ -34,5 +34,6 @@
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
+#include "custom/ops/vsi_nn_op_custom_letterbox.h"

 #endif
@@ -203,3 +203,4 @@ DEF_OP(BITCAST)
 DEF_OP(GROUPED_CONV3D)
 DEF_OP(COL2IM)
 DEF_OP(L1_LAYER_NORM)
+DEF_OP(ROPE)
@@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param
     float g_scale;
     float b_scale;
     /* pre process rgb layer local data structure */
-    vsi_nn_pre_process_rgb_lcl_data local;
+    vsi_nn_pre_process_rgb_lcl_data *local;
 } vsi_nn_pre_process_rgb_param;

 #ifdef __cplusplus
@@ -0,0 +1,49 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_OP_ROPE_H
+#define _VSI_NN_OP_ROPE_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_rope_param
+{
+    struct _rope_local_data_t* local;
+    // Add parameters here
+    int32_t axis;
+    vsi_bool interleaved;
+} vsi_nn_rope_param;
+_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \
+    vsi_nn_rope_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
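The new `vsi_nn_rope_param` carries just an axis and an `interleaved` flag. As a reference for what a rotary-position-embedding (RoPE) op typically computes, here is a minimal float sketch; the 10000 frequency base and the two pairing rules are generic RoPE conventions assumed by us, not taken from this diff.

#include <math.h>
#include <stdio.h>

/* Rotate pairs of features by position-dependent angles (RoPE).
 * interleaved != 0 pairs (x[2i], x[2i+1]); otherwise (x[i], x[i+d/2]).
 * Generic reference, not ovxlib's kernel. */
static void rope_apply(float *x, int dim, int pos, int interleaved)
{
    for (int i = 0; i < dim / 2; i++)
    {
        float theta = pos * powf(10000.0f, -2.0f * i / dim);
        float c = cosf(theta), s = sinf(theta);
        int a = interleaved ? 2 * i : i;
        int b = interleaved ? 2 * i + 1 : i + dim / 2;
        float xa = x[a], xb = x[b];
        x[a] = xa * c - xb * s;
        x[b] = xa * s + xb * c;
    }
}

int main(void)
{
    float v[4] = { 1.0f, 0.0f, 1.0f, 0.0f };
    rope_apply(v, 4, 3, 1);          /* position 3, interleaved pairing */
    printf("%f %f %f %f\n", v[0], v[1], v[2], v[3]);
    return 0;
}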
@@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param
 {
     uint32_t k;
     int32_t axis;
+    struct _topk_local_data_t* local;
 } vsi_nn_topk_param;

 #ifdef __cplusplus
@@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32

 static VSI_INLINE_API float bfp16_to_fp32
     (
-    int16_t in
+    uint16_t in
     )
 {
-    uint32_t t1, t2, t3;
     float out;
     fp32_bit_cast_t fp32_bit_cast;

-    t1 = in & 0x00FF; // Mantissa
-    t2 = in & 0xFF00; // Sign bit + Exponent
-    t3 = in & 0x7F00; // Exponent
-
-    t1 <<= 16;
-    t2 <<= 16; // Shift (sign + Exponent) bit into position
-    t1 |= t2; // Re-insert (sign + Exponent) bit
-
-    fp32_bit_cast.data = t1;
+    fp32_bit_cast.data = (uint32_t)(in << 16);
     out = fp32_bit_cast.val;

-    return t3 == 0 ? 0.0f : out;
+    return out;
 } /* bfp16_to_fp32() */

 static VSI_INLINE_API uint16_t fp32_to_fp16

@@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
         *dst = fp16_to_fp32( *(int16_t *)src );
         break;
     case VSI_NN_TYPE_BFLOAT16:
-        *dst = bfp16_to_fp32( *(int16_t *)src );
+        *dst = bfp16_to_fp32( *(uint16_t *)src );
         break;
     case VSI_NN_TYPE_FLOAT8_E4M3:
         *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
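This simplification works because bfloat16 is exactly the top 16 bits of an IEEE-754 float32: one shift into the high half and a bit-cast suffices, so the old mask-and-reassemble code (and its flush-to-zero of zero-exponent values) goes away, and taking the input as `uint16_t` avoids sign-extension surprises. A standalone round-trip sketch of the same idea:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bfloat16 <-> float32: bf16 is the high 16 bits of a float32, so
 * conversion is a shift plus a bit-cast (memcpy sidesteps strict
 * aliasing). Narrowing uses round-to-nearest-even; NaN payloads are
 * not specially handled in this sketch. */
static float bf16_to_f32(uint16_t in)
{
    uint32_t bits = (uint32_t)in << 16;
    float out;
    memcpy(&out, &bits, sizeof out);
    return out;
}

static uint16_t f32_to_bf16(float in)
{
    uint32_t bits;
    memcpy(&bits, &in, sizeof bits);
    /* round to nearest even before truncating the low 16 bits */
    uint32_t rounded = bits + 0x7FFF + ((bits >> 16) & 1);
    return (uint16_t)(rounded >> 16);
}

int main(void)
{
    float x = -1.625f;
    printf("%f -> 0x%04x -> %f\n", x, f32_to_bf16(x), bf16_to_f32(f32_to_bf16(x)));
    return 0;
}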
(File diff suppressed because it is too large.)
@@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t
 {
     char target_name[VSI_NN_MAX_TARGET_NAME];
     vsi_nn_hw_evis_t evis;
-#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     uint32_t subGroupSize;
-#endif
     uint32_t use_40bits_va;
     uint32_t support_stream_processor;
     uint32_t sp_exec_count;
     uint32_t sp_vector_depth;
     uint32_t sp_per_core_vector_depth;
+    uint32_t support_ffd;
 } vsi_nn_hw_config_t;

 typedef struct _vsi_nn_runtime_option_t
@@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t
     int32_t enable_save_file_type;
     int32_t enable_use_image_process;
     int32_t enable_use_from_handle;
+    vsi_nn_hw_config_t config;
 } vsi_nn_runtime_option_t;

 /**
@@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t
     vsi_nn_runtime_option_t options;
 } VSI_PUBLIC_TYPE *vsi_nn_context_t;

+/**
+ * Query and set options->config hw params.
+ */
+OVXLIB_API vsi_status query_hardware_caps_runtime
+    (
+    vsi_nn_context_t ctx,
+    vsi_nn_runtime_option_t *options
+    );
+
 /**
  * Create context
  * Create ovxlib NN runtime context.
@@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions
     (
     vsi_nn_runtime_option_t *options
     );
+OVXLIB_API vsi_status vsi_nn_initOptions_runtime
+    (
+    vsi_nn_runtime_option_t *options,
+    vsi_nn_context_t ctx
+    );
 /**
  * Release context
  * Release ovxlib NN runtime resource and reset context handle to NULL.
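A hypothetical call pattern for the two runtime-option entry points declared above; the signatures come from those declarations, but the header name and call order are assumptions not shown in this diff.

#include "vsi_nn_pub.h"   /* assumed public header */

vsi_status setup_runtime_options(vsi_nn_context_t ctx)
{
    vsi_nn_runtime_option_t options;

    /* Seed defaults for this context, then pull hardware caps into
     * options.config (the field added in this commit). */
    vsi_status status = vsi_nn_initOptions_runtime(&options, ctx);
    if (status == VSI_SUCCESS)
    {
        status = query_hardware_caps_runtime(ctx, &options);
    }
    return status;
}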
@@ -57,5 +57,8 @@
 #define VSI_PER_GROUP_QUANTIZATION_SUPPORT
 #endif
 #define VSI_GRAPH_RUNTIME_ENV_SUPPORT
+#if defined(VX_TENSOR_SPARSITY_SUPPORT)
+#define VSI_TENSOR_SPARSITY_SUPPORT
+#endif

 #endif
@@ -216,6 +216,7 @@
 #include "ops/vsi_nn_op_grouped_conv3d.h"
 #include "ops/vsi_nn_op_col2im.h"
 #include "ops/vsi_nn_op_l1_layer_norm.h"
+#include "ops/vsi_nn_op_rope.h"
 /* custom node head define define */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param
     vsi_nn_grouped_conv3d_param grouped_conv3d;
     vsi_nn_col2im_param col2im;
     vsi_nn_l1_layer_norm_param l1_layer_norm;
+    vsi_nn_rope_param rope;
     void* client_param;

     /* custom node data struct define */
@@ -86,8 +86,10 @@ typedef enum
     VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
     /** perchannel float8 */
     VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
-    /** GPQT */
+    /** pergroup symmetric */
     VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
+    /** pergroup asymmetric */
+    VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9,
     /** undefined type */
     VSI_NN_QNT_TYPE_NA = 0xff,
 } vsi_nn_qnt_type_e;
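The new enum value adds an asymmetric variant of per-group (GPTQ-style) quantization, where each group of weights along a channel carries its own scale and zero point. A standalone sketch of the dequantization this implies — our reading of the scheme, not ovxlib's kernel code:

#include <stdint.h>
#include <stdio.h>

/* Per-group asymmetric dequantization: element i uses the scale and
 * zero point of its group (i / group_size). Illustrative only. */
static void dequant_pergroup_asym(const uint8_t *q, float *out, int n,
                                  int group_size,
                                  const float *scales, const int32_t *zps)
{
    for (int i = 0; i < n; i++)
    {
        int g = i / group_size;
        out[i] = ((int32_t)q[i] - zps[g]) * scales[g];
    }
}

int main(void)
{
    uint8_t q[4] = { 0, 255, 0, 255 };
    float scales[2] = { 0.5f, 0.01f };
    int32_t zps[2] = { 128, 0 };
    float out[4];
    dequant_pergroup_asym(q, out, 4, 2, scales, zps);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}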
@@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
     int8_t is_scalar
     );

+/**
+ * Get Tensor is_sparsity
+ * Get the is_sparsity of the tensor
+ *
+ * @param[in] tensor Tensor.
+ *
+ * @return is_sparsity flag of the tensor.
+ */
+OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity
+    (
+    vsi_nn_tensor_t* tensor
+    );
+
+/**
+ * Set Weight Tensor whether is sparsity
+ * Set the is_sparsity for the tensor
+ *
+ * @param[in] tensor Tensor.
+ * @param[in] is_sparsity New is_sparsity value of the tensor.
+ *
+ * @return VSI_SUCCESS on success, or error code otherwise.
+ **/
+OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity(
+    vsi_nn_tensor_t* tensor,
+    int32_t is_sparsity
+    );
+
 OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
     (
     vsi_nn_graph_t* graph,
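A hypothetical use of the sparsity getter/setter declared above; the signatures come from the declarations, the call pattern is our own.

/* Mark a weight tensor as sparse if it is not already flagged. */
void mark_sparse_weights(vsi_nn_tensor_t *weights)
{
    if (!vsi_nn_GetTensorIsSparsity(weights))
    {
        vsi_nn_SetTensorIsSparsity(weights, 1);
    }
}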
@@ -33,7 +33,7 @@ extern "C"{

 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 2
-#define VSI_NN_VERSION_PATCH 14
+#define VSI_NN_VERSION_PATCH 22
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
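With this bump the packed version for 1.2.22 becomes 1*10000 + 2*100 + 22 = 10222. A small sketch of the encode/decode arithmetic (decoding the patch as v % 100 holds while minor and patch stay below 100):

#include <stdio.h>

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 22
#define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)

int main(void)
{
    int v = VSI_NN_VERSION;                    /* 10222 for 1.2.22 */
    printf("packed=%d -> %d.%d.%d\n", v, v / 10000, (v / 100) % 100, v % 100);
    return 0;
}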
@@ -0,0 +1,475 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_dtype_util.h"
+#include "utils/vsi_nn_dtype_util_prv.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+
+#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox"
+
+// Add kernel hashtable here
+#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
+        (( IN_DTYPE ) | ( OUT_DTYPE << 8 ))
+#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
+        { CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
+        CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \
+        _CUSTOM_LETTERBOX_KERNEL_SOURCE }
+
+typedef struct
+{
+    uint32_t key;
+    char * function_name;
+    const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _custom_letterbox_kernel_map[] =
+{
+    // Register kernel here
+    PACK_KERNEL_MAP( U8, U8 ),
+    PACK_KERNEL_MAP( U8, I8 ),
+    PACK_KERNEL_MAP( U8, F16 ),
+};
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _custom_letterbox_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+
+#define _CUSTOM_LETTERBOX_PARAM_NUM  _cnt_of_array( _custom_letterbox_kernel_param_def )
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    gpu_param_t gpu_param = {
+        2,
+        {0, 0, 0},
+        {0, 0, 0},
+        {0, 0, 0},
+        {0, 0, 0}
+        };
+
+    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
+    VSI_UNREFERENCED(param_size);
+    int32_t top = 0;
+    int32_t bottom = 0;
+    int32_t left = 0;
+    int32_t right = 0;
+    float scale_w = 0;
+    float scale_h = 0;
+    int32_t resize_w = 0;
+    int32_t resize_h = 0;
+    int32_t resize_max_w = 0;
+    int32_t resize_max_h = 0;
+    float output_scale = 1.0f;
+    float output_zp = 0;
+    float out_scale_r = 0;
+    float out_zp_r = 0;
+    float out_scale_g = 0;
+    float out_zp_g = 0;
+    float out_scale_b = 0;
+    float out_zp_b = 0;
+    float pad_v_r = 0;
+    float pad_v_g = 0;
+    float pad_v_b = 0;
+    int32_t in_width = 0;
+    int32_t in_height = 0;
+    int32_t out_width = 0;
+    int32_t out_height = 0;
+    float mean_r = 0;
+    float mean_g = 0;
+    float mean_b = 0;
+    float scale_r = 0;
+    float scale_g = 0;
+    float scale_b = 0;
+    vx_int32 pad_value_r = 0;
+    vx_int32 pad_value_g = 0;
+    vx_int32 pad_value_b = 0;
+    vx_int32 r_order = 0;
+    vx_int32 b_order = 0;
+    vx_int32 reverse_channel = 0;
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel);
+    CHECK_STATUS_FAIL_GOTO(status, final );
+
+    in_width = (int32_t)attr[0]->shape->data[0] / 3;
+    in_height = (int32_t)attr[0]->shape->data[1];
+    out_width = (int32_t)attr[1]->shape->data[0];
+    out_height = (int32_t)attr[1]->shape->data[1] / 3;
+
+    output_scale = 1.0f / attr[1]->scale;
+    output_zp = (float)(attr[1]->zero_point);
+
+    resize_w = out_width - left - right;
+    resize_h = out_height - top - bottom;
+    resize_max_w = out_width - right;
+    resize_max_h = out_height - bottom;
+    scale_w = (float)in_width / resize_w;
+    scale_h = (float)in_height / resize_h;
+    out_scale_r = scale_r / output_scale;
+    out_zp_r = output_zp - out_scale_r * mean_r;
+    out_scale_g = scale_g / output_scale;
+    out_zp_g = output_zp - out_scale_g * mean_g;
+    out_scale_b = scale_b / output_scale;
+    out_zp_b = output_zp - out_scale_b * mean_b;
+    pad_v_r = pad_value_r * out_scale_r + out_zp_r;
+    pad_v_g = pad_value_g * out_scale_g + out_zp_g;
+    pad_v_b = pad_value_b * out_scale_b + out_zp_b;
+
+    if (reverse_channel)
+    {
+        r_order = out_height * 2;
+        b_order = 0;
+    }
+    else
+    {
+        r_order = 0;
+        b_order = out_height * 2;
+    }
+
+    {
+        gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
+            0x00090909, // TCfg
+            0x00000000, // ASelt
+            0x00140003, 0x00000025, // ABin
+            0x000a0a0a, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00010001, 0x00000000, 0x00010001, 0x00000000,
+            0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uniLeftToFloat32_4x4 = {{
+            0x00010101, // TCfg
+            0x00000000, // ASelt
+            0x00010000, 0x00000002, // ABin
+            0x00020202, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000000, 0x00000001, 0x00000000,
+            0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uniExtactHalf8_2x8 = {{
+            0x11111111, // TCfg
+            0x11110000, // ASelt
+            0x06040200, 0x06040200, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000100, // AccumType, ConstantType, and PostShift
+            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
+            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uniExtract8Data_2x8 = {{
+            0x33333333, // TCfg
+            0x11110000, // ASelt
+            0x03020100, 0x03020100, // ABin
+            0x00000000, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00002400, // AccumType, ConstantType, and PostShift
+            0x00000000, 0x00000000, 0x00000000, 0x00000000,
+            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+        }, GPU_DP_TYPE_16 };
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 );
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 );
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 );
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 );
+    }
+    status |= vsi_nn_kernel_gpu_add_param( node, "top", &top );
+    status |= vsi_nn_kernel_gpu_add_param( node, "left", &left );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b );
+    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r );
+    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g );
+    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b );
+    status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w );
+    status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h );
+    status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w );
+    status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height );
+    status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order );
+    status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order );
+
+    gpu_param.global_scale[0] = 1;
+    gpu_param.global_scale[1] = 1;
+    gpu_param.global_size[0] = out_width;
+    gpu_param.global_size[1] = out_height;
+
+    status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
+    CHECK_STATUS_FAIL_GOTO(status, final );
+
+final:
+    if (attr[0])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[0] );
+        attr[0] = NULL;
+    }
+    if (attr[1])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[1] );
+        attr[1] = NULL;
+    }
+
+    return status;
+} /* _custom_letterbox_initializer() */
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+    (
+    vsi_nn_kernel_t * kernel,
+    vsi_nn_tensor_t * const * const inputs,
+    vsi_nn_tensor_t * const * const outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_dtype_e in_dtype;
+    vsi_nn_kernel_dtype_e out_dtype;
+    const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map;
+    size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map );
+    vx_param_description_t * param_def = _custom_letterbox_kernel_param_def;
+    size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def );
+    vx_kernel_initialize_f initializer = _custom_letterbox_initializer;
+    uint32_t key = 0;
+    uint32_t i = 0;
+
+    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+    key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype );
+
+    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+    {
+        if ( kernel_map[i].key == key )
+        {
+            break;
+        }
+    }
+    if ( i < (uint32_t)kernel_map_size )
+    {
+        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+        kernel->info.parameters = param_def;
+        kernel->info.numParams = (vx_uint32)param_def_size;
+        kernel->info.initialize = initializer;
+        // Register code source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+                "vsi_nn_kernel_header",
+                kernel_map[i].source_name );
+        // Register binary source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+                kernel_map[i].source_name );
+        status = VSI_SUCCESS;
+    }
+
+    return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+    (
+    vsi_nn_graph_t * graph,
+    vsi_nn_tensor_t ** inputs,
+    size_t input_num,
+    vsi_nn_tensor_t ** outputs,
+    size_t output_num,
+    const vsi_nn_kernel_param_t * params,
+    vsi_nn_kernel_t * kernel
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM];
+    vsi_nn_kernel_node_t node = NULL;
+    size_t i = 0;
+
+    int32_t top = vsi_nn_kernel_param_get_int32( params, "top");
+    int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom");
+    int32_t left = vsi_nn_kernel_param_get_int32( params, "left");
+    int32_t right = vsi_nn_kernel_param_get_int32( params, "right");
+    float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r");
+    float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g");
+    float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b");
+    float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r");
+    float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g");
+    float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b");
+    int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r");
+    int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g");
+    int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b");
+    int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel");
+    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
+
+    uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM;
+    VSI_UNREFERENCED(input_num);
+    VSI_UNREFERENCED(output_num);
+    shapes[0][0] = inputs[0]->attr.size[1] * 3;
+    shapes[0][1] = inputs[0]->attr.size[2];
+    shapes[1][0] = outputs[0]->attr.size[0];
+    shapes[1][1] = outputs[0]->attr.size[1] * 3;
+
+    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+        inputs[0], shapes[0], 2 );
+    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+        outputs[0], shapes[1], 2 );
+
+    if (reshape_tensors[0] == NULL ||
+        reshape_tensors[1] == NULL)
+    {
+        goto final;
+    }
+
+    if (reverse_channel)
+    {
+        float mean_temp = mean_r;
+        float scale_temp = scale_r;
+        int32_t pad_value_temp = pad_value_r;
+        mean_r = mean_b;
+        mean_b = mean_temp;
+        scale_r = scale_b;
+        scale_b = scale_temp;
+        pad_value_r = pad_value_b;
+        pad_value_b = pad_value_temp;
+    }
+
+    status = _query_kernel( kernel, inputs, outputs );
+    if ( VSI_SUCCESS == status)
+    {
+        node = vsi_nn_kernel_create_node( graph, kernel );
+        if ( node )
+        {
+            uint32_t index = 2;
+
+            vsi_nn_kernel_node_pack_io( node_params, param_num,
+                    reshape_tensors, 1, &reshape_tensors[1], 1 );
+
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_g );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel );
+
+            /* Pass parameters to node. */
+            status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
+            vsi_nn_kernel_scalar_release( &node_params[2] );
+            vsi_nn_kernel_scalar_release( &node_params[3] );
+            vsi_nn_kernel_scalar_release( &node_params[4] );
+            vsi_nn_kernel_scalar_release( &node_params[5] );
+            vsi_nn_kernel_scalar_release( &node_params[6] );
+            vsi_nn_kernel_scalar_release( &node_params[7] );
+            vsi_nn_kernel_scalar_release( &node_params[8] );
+            vsi_nn_kernel_scalar_release( &node_params[9] );
+            vsi_nn_kernel_scalar_release( &node_params[10] );
+            vsi_nn_kernel_scalar_release( &node_params[11] );
+            vsi_nn_kernel_scalar_release( &node_params[12] );
+            vsi_nn_kernel_scalar_release( &node_params[13] );
+            vsi_nn_kernel_scalar_release( &node_params[14] );
+            vsi_nn_kernel_scalar_release( &node_params[15] );
+
+            CHECK_STATUS(status);
+        }
+    }
+
+final:
+    for (i = 0; i < 2; i++)
+    {
+        vsi_safe_release_tensor(reshape_tensors[i]);
+    }
+
+    return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( custom_letterbox, _setup )
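Two details in this kernel are worth unpacking. Kernel variants are selected by packing the input and output dtypes into one key (IN | OUT << 8), and the initializer folds resize geometry plus (x - mean) * scale normalization plus output quantization into one affine constant per channel, so the shader does a single multiply-add per pixel. A standalone sketch of that arithmetic with made-up values; the exact direction of the quant-scale folding depends on how attr->scale is defined in ovxlib, so treat the convention here (q = real / quant_scale + zp) as an assumption:

#include <stdio.h>

/* Fold letterbox geometry and per-channel normalization/quantization
 * into one affine transform, mirroring the initializer above. */
int main(void)
{
    /* assumed example configuration */
    int in_w = 1920, in_h = 1080;            /* source image */
    int out_w = 640, out_h = 640;            /* letterboxed output */
    int top = 106, bottom = 106, left = 0, right = 0;
    float mean_r = 0.0f, scale_norm_r = 1.0f / 255.0f;
    float quant_scale = 1.0f / 255.0f;       /* output tensor scale */
    float out_zp = 0.0f;

    int resize_w = out_w - left - right;
    int resize_h = out_h - top - bottom;
    float scale_w = (float)in_w / resize_w;  /* src pixels per dst pixel */
    float scale_h = (float)in_h / resize_h;

    /* q = x * k + b with k = scale_norm / quant_scale, b = zp - k * mean */
    float k = scale_norm_r / quant_scale;
    float b = out_zp - k * mean_r;

    printf("resize %dx%d, step %.3fx%.3f, affine q = %.3f*x + %.3f\n",
           resize_w, resize_h, scale_w, scale_h, k, b);
    return 0;
}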
@ -35,6 +35,7 @@
|
||||||
#include "utils/vsi_nn_dtype_util.h"
|
#include "utils/vsi_nn_dtype_util.h"
|
||||||
#include "kernel/vsi_nn_kernel.h"
|
#include "kernel/vsi_nn_kernel.h"
|
||||||
#include "libnnext/vsi_nn_vxkernel.h"
|
#include "libnnext/vsi_nn_vxkernel.h"
|
||||||
|
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
|
||||||
|
|
||||||
#define _CPU_ARG_NUM (1)
|
#define _CPU_ARG_NUM (1)
|
||||||
#define _CPU_INPUT_NUM (1)
|
#define _CPU_INPUT_NUM (1)
|
||||||
|
|
@ -42,6 +43,7 @@
|
||||||
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
|
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
|
||||||
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
|
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
|
||||||
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
|
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
|
||||||
|
#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8")
|
||||||
|
|
||||||
#define SCALAR_INPUT_AXIS (2)
|
#define SCALAR_INPUT_AXIS (2)
|
||||||
|
|
||||||
|
|
@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
{
|
{
|
||||||
vsi_status status = VSI_FAILURE;
|
vsi_status status = VSI_FAILURE;
|
||||||
int sf_size = 0;
|
int sf_size = 0;
|
||||||
vsi_nn_kernel_tensor_attr_t* attr = NULL;
|
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
|
||||||
|
float srcZP = 0.0f;
|
||||||
|
float srcScale = 1.0f;
|
||||||
|
float dstZP = 0.0f;
|
||||||
|
float dstScale = 1.0f;
|
||||||
// Alignment with a power of two value.
|
// Alignment with a power of two value.
|
||||||
gpu_param_t gpu_param = {
|
gpu_param_t gpu_param = {
|
||||||
2, // workdim
|
2, // workdim
|
||||||
|
|
@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
|
|
||||||
VSI_UNREFERENCED(param_size);
|
VSI_UNREFERENCED(param_size);
|
||||||
|
|
||||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
|
||||||
if (!attr)
|
attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
|
||||||
|
if ((!attr[0]) || (!attr[1]))
|
||||||
{
|
{
|
||||||
VSILOGE("Query failure! at line");
|
VSILOGE("Query failure! at line");
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
sf_size = (int)attr->shape->data[0];
|
sf_size = (int)attr[0]->shape->data[0];
|
||||||
|
srcScale = attr[0]->scale;
|
||||||
|
srcZP = (float)attr[0]->zero_point;
|
||||||
|
dstScale = 1.0f / attr[1]->scale;
|
||||||
|
dstZP = (float)attr[1]->zero_point;
|
||||||
|
|
||||||
gpu_param.global_offset[0] = 0;
|
gpu_param.global_offset[0] = 0;
|
||||||
gpu_param.global_offset[1] = 0;
|
gpu_param.global_offset[1] = 0;
|
||||||
|
|
@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
gpu_param.local_size[0] = 1;
|
gpu_param.local_size[0] = 1;
|
||||||
gpu_param.local_size[1] = 1;
|
gpu_param.local_size[1] = 1;
|
||||||
gpu_param.global_size[0] =
|
gpu_param.global_size[0] =
|
||||||
gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
|
gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
|
||||||
gpu_param.local_size[0]);
|
gpu_param.local_size[0]);
|
||||||
gpu_param.global_size[1] =
|
gpu_param.global_size[1] =
|
||||||
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
|
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
|
||||||
|
|
@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
||||||
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
||||||
}, GPU_DP_TYPE_16};
|
}, GPU_DP_TYPE_16};
|
||||||
|
gpu_dp_inst_t uniExtract8Bin_2x8 = {{
|
||||||
|
0x11111111, // TCfg
|
||||||
|
0x11110000, // ASelt
|
||||||
|
0x06040200, 0x06040200, // ABin
|
||||||
|
0x22222222, // BSelt
|
||||||
|
0x00000000, 0x00000000, // BBin
|
||||||
|
0x00002400, // AccumType, ConstantType, and PostShift
|
||||||
|
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||||
|
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||||
|
}, GPU_DP_TYPE_16};
|
||||||
|
|
||||||
status = vsi_nn_kernel_gpu_add_param( node,
|
status = vsi_nn_kernel_gpu_add_param( node,
|
||||||
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
|
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
|
||||||
vsi_nn_kernel_gpu_add_param(node,
|
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||||
|
"uniExtract8Bin_2x8", &uniExtract8Bin_2x8 );
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
"sf_size", &sf_size);
|
"sf_size", &sf_size);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP);
|
||||||
}
|
}
|
||||||
|
|
||||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||||
|
|
||||||
if(status != VSI_SUCCESS)
|
if(status != VSI_SUCCESS)
|
||||||
{
|
{
|
||||||
VSILOGE("Initializer failure!");
|
VSILOGE("Initializer failure!");
|
||||||
}
|
}
|
||||||
if (attr) vsi_nn_kernel_tensor_attr_release( &attr );
|
if (attr[0])
|
||||||
|
{
|
||||||
|
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||||
|
attr[0] = NULL;
|
||||||
|
}
|
||||||
|
if (attr[1])
|
||||||
|
{
|
||||||
|
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||||
|
attr[1] = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const vx_kernel_description_t _kernel_info =
|
static const vx_kernel_description_t _kernel_info1 =
|
||||||
{
|
{
|
||||||
KERNEL_ID_PLACEHOLDER,
|
KERNEL_ID_PLACEHOLDER,
|
||||||
_KERNEL_NAME,
|
_KERNEL_NAME,
|
||||||
|
|
@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info =
|
||||||
vsi_nn_KernelDeinitializer
|
vsi_nn_KernelDeinitializer
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static const vx_kernel_description_t _kernel_info2 =
|
||||||
|
{
|
||||||
|
KERNEL_ID_PLACEHOLDER,
|
||||||
|
_KERNEL_NAME_U8,
|
||||||
|
NULL,
|
||||||
|
kernel_param_def,
|
||||||
|
_cnt_of_array( kernel_param_def ),
|
||||||
|
vsi_nn_KernelValidator,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
_softmax_initializer,
|
||||||
|
 vsi_nn_KernelDeinitializer
 };

 static vsi_status _query_kernel
     (
     vsi_nn_tensor_t* const* const inputs,

@@ -146,9 +196,20 @@ static vsi_status _query_kernel
     vsi_nn_kernel_t* kernel
     )
 {
-    VSI_UNREFERENCED(inputs);
-    VSI_UNREFERENCED(outputs);
-    memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+    vsi_nn_kernel_dtype_e in_dtype;
+    vsi_nn_kernel_dtype_e out_dtype;
+
+    in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+    out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
+
+    if (in_dtype == U8 && out_dtype == U8)
+    {
+        memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) );
+    }
+    else
+    {
+        memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) );
+    }
+
     vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
         "vsi_nn_kernel_header",

@@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis = 0;
+    vsi_nn_tensor_t* reshape_tensors[2] = {NULL};
+    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
+    uint32_t rank_in = 0;
+    int32_t new_axis = 0;
+    uint32_t i = 0;
+    vsi_bool ret = vx_false_e;

     VSI_UNREFERENCED(input_num);
     VSI_UNREFERENCED(output_num);

     axis = vsi_nn_kernel_param_get_int32(params, "axis");

+    ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size,
+                                               inputs[0]->attr.dim_num,
+                                               axis,
+                                               shapes[0],
+                                               &rank_in,
+                                               &new_axis);
+
+    if (ret)
+    {
+        reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in);
+        reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in);
+    }
+    else
+    {
+        return NULL;
+    }
+
+    if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size,
+                                       reshape_tensors[0]->attr.dim_num) ||
+        new_axis > 2)
+    {
+        return NULL;
+    }
+
     status = _query_kernel( inputs, outputs, kernel );
     if( VSI_SUCCESS == status)
     {

@@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup
         {
             /* Set inputs and outputs */
             vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
-                inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
+                reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM );
             backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
-                graph, I32, &axis );
+                graph, I32, &new_axis );

             /* Pass parameters to node. */
             status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );

@@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup
             status = VSI_FAILURE;
         }
     }
+
+    for (i = 0; i < 2; i++)
+    {
+        vsi_safe_release_tensor(reshape_tensors[i]);
+    }
     return node;
 } /* _setup() */
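
Note on the reshape introduced in _setup above: this diff does not show the body of vsi_nn_kernel_optimize_softmax_shape, so the sketch below is only a hedged illustration of the usual folding idea (merge every dimension below the softmax axis into one, and every dimension above it into another, so the kernel always sees the axis at a fixed position). collapse_softmax_shape is a hypothetical name, not the library API.

    /* Hedged sketch only: illustrates the dimension-folding idea behind
     * vsi_nn_kernel_optimize_softmax_shape; the real helper may differ. */
    #include <stdint.h>

    static void collapse_softmax_shape(
        const uint32_t* in_shape, uint32_t rank, int32_t axis,
        uint32_t out_shape[3], int32_t* new_axis)
    {
        uint32_t inner = 1, outer = 1, i;
        for (i = 0; i < (uint32_t)axis; i++)   inner *= in_shape[i]; /* dims below axis */
        for (i = (uint32_t)axis + 1; i < rank; i++) outer *= in_shape[i]; /* dims above axis */
        out_shape[0] = inner;
        out_shape[1] = in_shape[axis];
        out_shape[2] = outer;
        *new_axis = 1; /* softmax axis after folding */
    }
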
@@ -0,0 +1,227 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_internal_node.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+typedef struct _custom_letterbox_local_data_t {
+    int32_t placeholder;
+} custom_letterbox_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM          (1)
+#define _OUTPUT_NUM         (1)
+
+int32_t my_round(float in)
+{
+    if (in >= 0)
+    {
+        return (int)(in + 0.5f);
+    }
+    else
+    {
+        return (int)(in - 0.5f);
+    }
+}
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_nn_kernel_param_t * param = NULL;
+    vsi_nn_custom_letterbox_param * p;
+    p = &(self->nn_param.custom_letterbox);
+    int32_t shape_w = (int32_t)inputs[0]->attr.size[1];
+    int32_t shape_h = (int32_t)inputs[0]->attr.size[2];
+    int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0];
+    int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1];
+    vx_bool auto_bool = p->auto_bool;
+    vx_bool scaleFill = p->scaleFill;
+    vx_bool scaleup = p->scaleup;
+    int32_t stride = p->stride;
+    vx_bool center = p->center;
+
+    float r = 1.0f;
+    int32_t new_unpad_w = 0;
+    int32_t new_unpad_h = 0;
+    int32_t dw = 0;
+    int32_t dh = 0;
+    int32_t top = 0;
+    int32_t bottom = 0;
+    int32_t left = 0;
+    int32_t right = 0;
+
+    r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
+    if (!scaleup)
+    {
+        r = (float)fmin(r, 1.0f);
+    }
+
+    new_unpad_w = my_round(r * shape_w);
+    new_unpad_h = my_round(r * shape_h);
+    dw = new_shape_w - new_unpad_w;
+    dh = new_shape_h - new_unpad_h;
+    if (auto_bool)
+    {
+        dw = dw % stride;
+        dh = dh % stride;
+    }
+    else if (scaleFill)
+    {
+        dw = 0;
+        dh = 0;
+        new_unpad_w = new_shape_w;
+        new_unpad_h = new_shape_h;
+    }
+    if (center)
+    {
+        top = my_round(dh / 2.0f - 0.1f);
+        bottom = my_round(dh / 2.0f + 0.1f);
+        left = my_round(dw / 2.0f - 0.1f);
+        right = my_round(dw / 2.0f + 0.1f);
+    }
+    else
+    {
+        top = 0;
+        bottom = my_round(dh + 0.1f);
+        left = 0;
+        right = my_round(dw + 0.1f);
+    }
+
+    param = vsi_nn_kernel_param_create();
+    vsi_nn_kernel_param_add_int32( param, "top", top);
+    vsi_nn_kernel_param_add_int32( param, "bottom", bottom);
+    vsi_nn_kernel_param_add_int32( param, "left", left);
+    vsi_nn_kernel_param_add_int32( param, "right", right);
+    vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r);
+    vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g);
+    vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b);
+    vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r);
+    vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g);
+    vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b);
+    vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r);
+    vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g);
+    vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b);
+    vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel);
+
+    self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+        "custom_letterbox",
+        inputs, 1,
+        outputs, 1, param );
+
+    vsi_nn_kernel_param_release( &param );
+
+    return VSI_SUCCESS;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1)
+        IO_TYPE(D_U8, D_F16)
+        IO_TYPE(D_U8, D_U8|Q_ASYM)
+        IO_TYPE(D_U8, D_I8|Q_DFP)
+        IO_TYPE(D_U8, D_I8|Q_ASYM)
+        IO_TYPE(D_U8, D_I8|Q_SYM)
+    END_IO_TYPE_DECL(LETTERBOX)
+    if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+        outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w;
+        outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h;
+        outputs[0]->attr.size[2] = 3;
+        outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
+    }
+
+    return TRUE;
+} /* op_setup() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t* self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    status = vsi_nn_op_common_deinit(self);
+
+    return status;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name    */ CUSTOM_LETTERBOX,
+    /* init       */ NULL,
+    /* compute    */ op_compute,
+    /* deinit     */ op_deinit,
+    /* check      */ op_check,
+    /* setup      */ op_setup,
+    /* optimize   */ NULL,
+    /* input_num  */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+
+__END_DECLS
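
The padding math in op_compute above follows the familiar YOLO-style letterbox. As a standalone check with illustrative numbers (not taken from the commit): fitting a 1280x720 frame onto a 640x640 canvas gives r = min(0.5, 0.889) = 0.5, new_unpad = 640x360, dh = 280, and with center set both top and bottom round to 140.

    /* Standalone check of the letterbox padding arithmetic above;
     * the input/output sizes are illustrative only. */
    #include <math.h>
    #include <stdio.h>

    static int my_round_f(float in) /* same rounding rule as my_round() above */
    {
        return in >= 0 ? (int)(in + 0.5f) : (int)(in - 0.5f);
    }

    int main(void)
    {
        int shape_w = 1280, shape_h = 720;        /* source frame */
        int new_shape_w = 640, new_shape_h = 640; /* letterboxed canvas */
        float r = fminf((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
        int new_unpad_h = my_round_f(r * shape_h); /* 360 */
        int dh = new_shape_h - new_unpad_h;        /* 280 */
        /* center == TRUE: split the padding evenly across top and bottom */
        printf("r=%.3f top=%d bottom=%d\n", r,
               my_round_f(dh / 2.0f - 0.1f), my_round_f(dh / 2.0f + 0.1f)); /* 140 / 140 */
        return 0;
    }
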
@@ -85,18 +85,24 @@ static const struct {
     HASH_CUMSUM_KERNELS(0, U8, U8)
     HASH_CUMSUM_KERNELS(0, F32, F32)
     HASH_CUMSUM_KERNELS(0, F32, U8)
+    HASH_CUMSUM_KERNELS(0, I32, I32)
     HASH_CUMSUM_KERNELS(1, U8, U8)
     HASH_CUMSUM_KERNELS(1, F32, F32)
     HASH_CUMSUM_KERNELS(1, F32, U8)
+    HASH_CUMSUM_KERNELS(1, I32, I32)
     HASH_CUMSUM_KERNELS(2, U8, U8)
     HASH_CUMSUM_KERNELS(2, F32, F32)
     HASH_CUMSUM_KERNELS(2, F32, U8)
+    HASH_CUMSUM_KERNELS(2, I32, I32)

     HASH_CUMSUM_KERNELS_2D(0, U8, U8)
     HASH_CUMSUM_KERNELS_2D(0, F32, F32)
     HASH_CUMSUM_KERNELS_2D(0, F32, U8)
+    HASH_CUMSUM_KERNELS_2D(0, I32, I32)
     HASH_CUMSUM_KERNELS_2D(1, U8, U8)
     HASH_CUMSUM_KERNELS_2D(1, F32, F32)
     HASH_CUMSUM_KERNELS_2D(1, F32, U8)
+    HASH_CUMSUM_KERNELS_2D(1, I32, I32)

     HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
     HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"
@@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup

 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     shader_cnt_support =
-        (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
+        (((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 &&
+        ((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? TRUE : FALSE;
 #endif
     if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
     {
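
This hunk and several later ones replace graph->ctx->config.* with ((vsi_nn_graph_prv_t*)graph)->options->config.*, moving the hardware config behind the private graph type declared in the newly included vsi_nn_types_prv.h. The snippet below is only a hedged sketch of why such a downcast is legal; the stand-in types and the assumption that the public struct is the first member are mine, not the library's definitions.

    /* Hedged sketch of the public/private struct idiom used above.
     * pub_graph_t / prv_graph_t are stand-ins; the real layout of
     * vsi_nn_graph_prv_t is assumed here, not shown in the diff. */
    typedef struct { int placeholder; } pub_graph_t;   /* plays vsi_nn_graph_t */

    typedef struct {
        pub_graph_t pub;   /* public part first, so a pub_graph_t* can be downcast */
        struct {
            struct { int subGroupSize; int use_40bits_va; } config;
        } *options;        /* assumed shape of the private options block */
    } prv_graph_t;         /* plays vsi_nn_graph_prv_t */

    static int sub_group_size(pub_graph_t* graph)
    {
        /* mirrors ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize */
        return ((prv_graph_t*)graph)->options->config.subGroupSize;
    }
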
@@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] =
     PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
+    PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
     PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
 };
@@ -79,7 +79,7 @@ static const struct {
     const char* source_name;
 } kernel_map[] =
 {
     PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
     PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1)
     PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
     PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1)
@@ -0,0 +1,329 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+    INTERNAL_KERNEL_ROPE,
+} _internal_kernel_e;
+
+#define _ROPE_KERNEL_SOURCE      "rope"
+#define _ROPE_KERNEL_NAME        CVIVANTE_NAMESPACE("cl.rope")
+
+// Add kernel hashtable here
+#define STR(a) #a
+#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
+        ((IN0_DTYPE) | (IN0_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25))
+#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
+        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \
+          CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \
+          "rope_0" }
+
+typedef struct
+{
+    uint32_t key;
+    char * function_name;
+    const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _rope_kernel_map[] =
+{
+    // Register kernel here
+    PACK_KERNEL_MAP( F32, F32, F32, 0 ),
+    PACK_KERNEL_MAP( F32, F32, F32, 1 ),
+    PACK_KERNEL_MAP( F32, F32, F32, 2 ),
+    PACK_KERNEL_MAP( I32, I32, I32, 0 ),
+    PACK_KERNEL_MAP( I32, I32, I32, 1 ),
+    PACK_KERNEL_MAP( I32, I32, I32, 2 ),
+    PACK_KERNEL_MAP( U32, U32, U32, 0 ),
+    PACK_KERNEL_MAP( U32, U32, U32, 1 ),
+    PACK_KERNEL_MAP( U32, U32, U32, 2 ),
+};
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _rope_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _ROPE_PARAM_NUM  _cnt_of_array( _rope_kernel_param_def )
+#define SCALAR_AXIS           (4)
+#define SCALAR_IN_ZP          (5)
+#define SCALAR_COS_ZP         (6)
+#define SCALAR_SIN_ZP         (7)
+#define SCALAR_SCALE0         (8)
+#define SCALAR_SCALE1         (9)
+#define SCALAR_OUT_ZP         (10)
+#define SCALAR_HALF_HEAD_SIZE (11)
+#define SCALAR_STEP           (12)
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_rope_initializer)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    gpu_param_t gpu_param = {
+        3,         // workdim
+        {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
+        {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+        {0, 0, 0}, // localWorkSize: local group size in thread
+        {0, 0, 0}  // globalWorkSize: image size in thread
+        };
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL };
+    int32_t axis = 0;
+    vsi_size_array_t* out_shape = NULL;
+    vsi_size_t shape[3] = { 1 };
+
+    VSI_UNREFERENCED(node);
+    VSI_UNREFERENCED(param_size);
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
+    CHECK_STATUS_FAIL_GOTO(status, final);
+
+    out_shape = attr[1]->shape;
+    shape[0] = out_shape->data[0];
+    shape[1] = out_shape->data[1];
+    shape[2] = out_shape->data[2];
+    shape[axis] = shape[axis] / 2;
+
+    gpu_param.global_scale[0] = 1;
+    gpu_param.global_scale[1] = 1;
+    gpu_param.global_scale[2] = 1;
+    gpu_param.global_size[0] = shape[0];
+    gpu_param.global_size[1] = shape[1];
+    gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1;
+
+    status = vsi_nn_kernel_gpu_config(node, &gpu_param);
+
+final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
+    SAFE_FREE_TENSOR_ATTR(attr[0]);
+    SAFE_FREE_TENSOR_ATTR(attr[1]);
+#undef SAFE_FREE_TENSOR_ATTR
+
+    return status;
+} /* _rope_initializer() */
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+    (
+    vsi_nn_kernel_t * kernel,
+    vsi_nn_tensor_t * const * const inputs,
+    vsi_nn_tensor_t * const * const outputs,
+    int32_t axis
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_dtype_e in0_dtype;
+    vsi_nn_kernel_dtype_e in1_dtype;
+    vsi_nn_kernel_dtype_e in2_dtype;
+    vsi_nn_kernel_dtype_e out_dtype;
+    const _kernel_map_type * kernel_map = _rope_kernel_map;
+    size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
+    vx_param_description_t * param_def = _rope_kernel_param_def;
+    vx_kernel_initialize_f initializer = _rope_initializer;
+
+    uint32_t key = 0;
+    uint32_t i;
+
+    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+    in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
+    in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type);
+    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \
+        ((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24))
+    switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype))
+    {
+    case _PACK_SELECT_KEY(F32, F32, F32, F32):
+    case _PACK_SELECT_KEY(F16, F16, F16, F16):
+        key = ROPE_HASH_KEY(F32, F32, F32, axis);
+        break;
+    case _PACK_SELECT_KEY(U8, U8, U8, U8):
+    case _PACK_SELECT_KEY(U16, U16, U16, U16):
+        key = ROPE_HASH_KEY(U32, U32, U32, axis);
+        break;
+    case _PACK_SELECT_KEY(I8, I8, I8, I8):
+    case _PACK_SELECT_KEY(I16, I16, I16, I16):
+    case _PACK_SELECT_KEY(I32, I32, I32, I32):
+        key = ROPE_HASH_KEY(I32, I32, I32, axis);
+        break;
+    default:
+        break;
+    }
+#undef _PACK_SELECT_KEY
+
+    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+    {
+        if ( kernel_map[i].key == key )
+        {
+            break;
+        }
+    }
+    if ( i < (uint32_t)kernel_map_size )
+    {
+        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+        kernel->info.parameters = param_def;
+        kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
+        kernel->info.initialize = initializer;
+        // Register code source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+                "eltwise_ops_helper",
+                kernel_map[i].source_name );
+        // Register binary source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+                kernel_map[i].source_name );
+        status = VSI_SUCCESS;
+    }
+    return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+    (
+    vsi_nn_graph_t * graph,
+    vsi_nn_tensor_t ** inputs,
+    size_t input_num,
+    vsi_nn_tensor_t ** outputs,
+    size_t output_num,
+    const vsi_nn_kernel_param_t * params,
+    vsi_nn_kernel_t * kernel
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL};
+    vsi_nn_kernel_node_t node = NULL;
+    int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
+    int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
+    float in_scale = vsi_nn_get_tensor_scale(inputs[0]);
+    float cos_scale = vsi_nn_get_tensor_scale(inputs[1]);
+    float sin_scale = vsi_nn_get_tensor_scale(inputs[2]);
+    float out_scale = vsi_nn_get_tensor_scale(outputs[0]);
+    float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+    float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
+    float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
+    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+    int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2);
+    float scale0 = in_scale * cos_scale / out_scale;
+    float scale1 = in_scale * sin_scale / out_scale;
+    int32_t step = interleaved ? 2 : 1;
+    int32_t i = 0;
+
+    // Check if gpu can support the size
+    if ( !vsi_nn_kernel_gpu_check_shape(
+        inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
+    {
+        return NULL;
+    }
+
+    status = _query_kernel( kernel, inputs, outputs, axis );
+    if (VSI_SUCCESS == status)
+    {
+        node = vsi_nn_kernel_create_node( graph, kernel );
+        if ( node )
+        {
+            /* Set inputs and outputs */
+            vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
+                    inputs, input_num, outputs, output_num );
+            /* Pass parameters to node. */
+            node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
+                    graph, I32, &axis);
+            node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &in_zp);
+            node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &cos_zp);
+            node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &sin_zp);
+            node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &scale0);
+            node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &scale1);
+            node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &output_zp);
+            node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create(
+                    graph, I32, &half_head_size);
+            node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create(
+                    graph, I32, &step);
+            status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
+        }
+    }
+
+    for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++)
+    {
+        if (node_params[i])
+        {
+            vsi_nn_kernel_scalar_release(&node_params[i]);
+        }
+    }
+    return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CL( rope, _setup )
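
A note on ROPE_HASH_KEY above: the key keeps the input dtype in bits 0-7, repeats IN0_DTYPE in bits 8-15, places the output dtype at bit 16 and the axis at bit 25, so IN1_DTYPE never reaches the key as written. That still yields distinct keys for the table above, since every registered entry uses one dtype for all operands. A hedged check with made-up enum values (the real vsi_nn_kernel_dtype_e numbering is not shown in this diff):

    /* Hedged check of the key packing; f32/i32 values are illustrative
     * stand-ins for the real vsi_nn_kernel_dtype_e enum. */
    #include <stdint.h>
    #include <assert.h>

    #define ROPE_HASH_KEY(IN0, IN1, OUT, AXIS) \
            ((IN0) | ((IN0) << 8) | ((OUT) << 16) | ((AXIS) << 25))

    int main(void)
    {
        uint32_t f32 = 10, i32 = 11; /* illustrative enum values */
        /* axis occupies its own bit range, so the same dtypes with
           different axes never collide: */
        assert(ROPE_HASH_KEY(f32, f32, f32, 1) != ROPE_HASH_KEY(f32, f32, f32, 2));
        /* IN1 is not folded into the key as written, so these two collide: */
        assert(ROPE_HASH_KEY(f32, f32, f32, 0) == ROPE_HASH_KEY(f32, i32, f32, 0));
        return 0;
    }
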
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup
     VSI_UNREFERENCED(output_num);

 #if (VX_ACTIVATION_EXT_SUPPORT)
-    if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
+    if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
     {
         return NULL;
     }
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup
     vsi_bool is_odd_even_sort = FALSE;
     vsi_bool is_bitnoic_segment = FALSE;
     size_t param_num = _TOPK_PARAM_NUM;
-    int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
+    int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2);
     vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
     vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

@@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup
         return NULL;
     }

+    if (block_size >= GPU_TENSOR_MAX_WIDTH)
+    {
+        return NULL;
+    }
+
     shape[0][0] = block_size;
     shape[0][1] = block_num;
     shape[1][0] = top_k;
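
For the max_stages line above: with an example subGroupSize of 64, subGroupSize >> 2 is 16, so max_stages = 7 + log2(16) = 11; the new guard then rejects block sizes that would not fit one GPU tensor row. Quick numeric check:

    /* Numeric check of the max_stages formula above; 64 is just an
     * example subGroupSize, not a value taken from the commit. */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        int subGroupSize = 64;
        int max_stages = 7 + (int)log2(subGroupSize >> 2); /* 7 + 4 = 11 */
        printf("max_stages=%d\n", max_stages);
        return 0;
    }
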
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types
         return FALSE;
     }

-    if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2)
+    if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2)
     {
         return FALSE;
     }
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup
     temp_tensor[1] = weights;
     temp_tensor[2] = biases;

-    ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver);
+    ks = get_kernel_size(weights->attr.size[0], dilation, stride,
+        ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver);

     status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);
@@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] =
     TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 )
     TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 )
     TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 )
+    TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 )
+    TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
 };

@@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] =
     TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 )
+
+    TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 )
+    TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 )
 };

 /*

@@ -245,6 +250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
     float sum_x2_tail0 = 1;
     float sum_x2_tail1 = 1;
     float work_item_pixels = 1;
+    vsi_bool is_input_8bits = FALSE;

     VSI_UNREFERENCED(param_size);

@@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
     width = (int32_t)(input_shape->data[0]);
     height = (int32_t)(input_shape->data[1]);
     chn = (int32_t)(attr[1]->shape->data[1]);
+    is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8;
     if (is2D)
     {
         height = 1;
     }

-    work_item_pixels = (float)height * 16;
+    work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height;

     sum_x_tail = -work_item_pixels * input_zp * input_scale;
     sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2;

@@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
     shaderParam.local_size[1] = 1;
     shaderParam.local_size[2] = 1;

-    if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
+    if (is_input_8bits)
     {
         shaderParam.global_size[0] = (width + 255) / 256 * 16;
     }
-    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
+    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
     {
         shaderParam.global_size[0] = (width + 127) / 128 * 16;
     }

@@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
         status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1);
         CHECK_STATUS_FAIL_GOTO(status, OnError );
     }
-    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
+    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
     {
         gpu_dp_inst_t uniSum_X_X2_8x2 = {{
             0x55555555, // TCfg

@@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
     }

     shaderParam.global_scale[0] = 16;
-    if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
+    if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
     {
         shaderParam.global_scale[0] = 8;
     }

@@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
             CHECK_STATUS_FAIL_GOTO(status, OnError );
         }
         break;
+    case _PACK_SELECT_KEY( U16, U16 ):
     case _PACK_SELECT_KEY( I16, I16 ):
     case _PACK_SELECT_KEY( I16, F16 ):
     case _PACK_SELECT_KEY( F16, F16 ):

@@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup
     attr.is_const = FALSE;
     attr.vtl = TRUE;
     attr.size[0] = ((new_shape[0] + 255) / 256) * 4;
-    if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
-        || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
+    if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16)
     {
         attr.size[0] = ((new_shape[0] + 127) / 128) * 4;
     }
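
The work_item_pixels change above tracks the launch sizes in the same initializer: 8-bit inputs are covered by (width + 255) / 256 * 16 threads per row at 16 pixels each, while the 16-bit path (now including U16) uses (width + 127) / 128 * 16 threads at 8 pixels each. Worked numbers, chosen here only for illustration:

    /* Worked example of the sums-initializer sizing above;
     * width and height are illustrative values. */
    #include <stdio.h>

    int main(void)
    {
        int width = 512, height = 32;
        int is_input_8bits = 1; /* I8/U8 path */
        int threads_x = is_input_8bits ? (width + 255) / 256 * 16
                                       : (width + 127) / 128 * 16;
        float work_item_pixels = is_input_8bits ? 16.0f * height : 8.0f * height;
        printf("threads_x=%d pixels_per_work_item=%.0f\n", threads_x, work_item_pixels);
        /* 8-bit:  threads_x=32, pixels_per_work_item=512
           16-bit: threads_x=64, pixels_per_work_item=256 */
        return 0;
    }
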
@@ -124,22 +124,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
         {0, 0, 0}
         };
     int8_t in0_fl = 0;
-    int32_t inputZP0 = 0;
-    float input_scale0 = 1.0f;
-    int32_t inputZP1 = 0;
-    float input_scale1 = 1.0f;
+    int32_t input0_zp = 0;
+    float input0_scale = 1.0f;
+    int32_t input1_zp = 0;
+    float input1_scale = 1.0f;
+    float output_zp = 0;
     int8_t out_fl = 0;
-    float outputZP = 0;

     int32_t shift0 = 0;
     vsi_bool is_ge_fl = FALSE;

     vsi_bool is_2d_img = FALSE;
     uint32_t evis_version = 0;

     vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
     vsi_size_array_t * out_shape = NULL;
     uint32_t pack_key;
     vx_context ctx = vxGetContext((vx_reference)node);
     vx_hardware_caps_params_t hw_param;

     VSI_UNREFERENCED(param_size);

@@ -165,34 +166,30 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
     CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

     out_shape = attr[2]->shape;
-    inputZP0 = attr[0]->zero_point;
-    input_scale0 = attr[0]->scale;
-    inputZP1 = attr[1]->zero_point;
-    input_scale1 = attr[1]->scale;
-    outputZP = (float)attr[2]->zero_point;
-    input_scale0 = input_scale0 / attr[2]->scale;
+    input0_zp = attr[0]->zero_point;
+    input0_scale = attr[0]->scale;
+    input1_zp = attr[1]->zero_point;
+    input1_scale = attr[1]->scale;
+    output_zp = (float)attr[2]->zero_point;
+    input0_scale = input0_scale / attr[2]->scale;

-    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
+    if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP &&
+        attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
     {
         in0_fl = (int8_t)attr[0]->dfp.fl;
-    }
-
-    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
-    {
         out_fl = (int8_t)attr[2]->dfp.fl;
+        shift0 = in0_fl - out_fl;
+        is_ge_fl = shift0 >= 0;
     }

-    shift0 = in0_fl - out_fl;
-
     is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);
-    is_ge_fl = shift0 >= 0;

 #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \
         (IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26))

-    pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version );
+    pack_key = _PACK_SELECT_KEY(attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version);

-    if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
+    if (attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
     {
         gpu_param.global_scale[0] = 16;
         gpu_param.global_scale[1] = 1;

@@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
         gpu_param.global_scale[1] = 1;
         gpu_param.global_scale[2] = 1;
     }

     gpu_param.global_size[0] = gpu_align_p2(
         (out_shape->data[0] + gpu_param.global_scale[0] - 1)
         / gpu_param.global_scale[0], 4);

@@ -215,97 +211,97 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)

     switch( pack_key )
     {
-    case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ):
-    case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ):
-        {
-            gpu_dp_inst_t uniPreluDFPLo_2x8b = {{
-                0x77777777, // TCfg
-                0x44444444, // ASelt
-                0x33221100, 0x77665544, // ABin
-                0x00000000, // BSelt
-                0x30201000, 0x70605040, // BBin
-                0x00004000, // AccumType, ConstantType, and PostShift
-                0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-            gpu_dp_inst_t uniPreluDFPHi_2x8b = {{
-                0x77777777, // TCfg
-                0x44444444, // ASelt
-                0xbbaa9988, 0xffeeddcc, // ABin
-                0x00000000, // BSelt
-                0x30201000, 0x70605040, // BBin
-                0x00004000, // AccumType, ConstantType, and PostShift
-                0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-
-            if ( attr[0]->dtype == I16 )
-            {
-                uniPreluDFPLo_2x8b.data[7] = 0x00003000;
-                uniPreluDFPHi_2x8b.data[7] = 0x00003000;
-            }
-
-            gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 );
-            gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 );
-
-            status = vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b );
-            status |= vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b );
-            CHECK_STATUS_FAIL_GOTO(status, final );
-        }
-        break;
-    case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ):
-    case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ):
-        {
-            gpu_dp_inst_t uniPreluInt8_2x8 = {{
-                0x55555555, // TCfg
-                0x00000000, // ASelt
-                0xb3a29180, 0xf7e6d5c4, // ABin
-                0x66666666, // BSelt
-                0x30201000, 0x70605040, // BBin
-                0x00000600, // AccumType, ConstantType, and PostShift
-                0x00000001, 0x00000001, 0x00000001, 0x00000001,
-                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
-            }, GPU_DP_TYPE_16 };
-            gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{
-                0x05050505, // TCfg
-                0x00000000, // ASelt
-                0x00510040, 0x00730062, // ABin
-                0x06060606, // BSelt
-                0x00100000, 0x00300020, // BBin
-                0x00000400, // AccumType, ConstantType, and PostShift
-                0x00000001, 0x00000000, 0x00000001, 0x00000000,
-                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-            gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{
-                0x05050505, // TCfg
-                0x00000000, // ASelt
-                0x00510040, 0x00730062, // ABin
-                0x06060606, // BSelt
-                0x00500040, 0x00700060, // BBin
-                0x00000400, // AccumType, ConstantType, and PostShift
-                0x00000001, 0x00000000, 0x00000001, 0x00000000,
-                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-
-            gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 );
-            gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 );
-            gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 );
-
-            status = vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluInt8_2x8", &uniPreluInt8_2x8 );
-            status |= vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 );
-            status |= vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 );
-            CHECK_STATUS_FAIL_GOTO(status, final );
-        }
-        break;
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ):
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ):
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ):
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ):
+    case _PACK_SELECT_KEY(I8, I8, 1, 1, 2):
+    case _PACK_SELECT_KEY(I16, I16, 1, 1, 2):
+        {
+            gpu_dp_inst_t uniPreluDFPLo_2x8b = { {
+                0x77777777, // TCfg
+                0x44444444, // ASelt
+                0x33221100, 0x77665544, // ABin
+                0x00000000, // BSelt
+                0x30201000, 0x70605040, // BBin
+                0x00004000, // AccumType, ConstantType, and PostShift
+                0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+            gpu_dp_inst_t uniPreluDFPHi_2x8b = { {
+                0x77777777, // TCfg
+                0x44444444, // ASelt
+                0xbbaa9988, 0xffeeddcc, // ABin
+                0x00000000, // BSelt
+                0x30201000, 0x70605040, // BBin
+                0x00004000, // AccumType, ConstantType, and PostShift
+                0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+
+            if (attr[0]->dtype == I16)
+            {
+                uniPreluDFPLo_2x8b.data[7] = 0x00003000;
+                uniPreluDFPHi_2x8b.data[7] = 0x00003000;
+            }
+
+            gpu_dp_inst_update_postshfit(&uniPreluDFPLo_2x8b, shift0);
+            gpu_dp_inst_update_postshfit(&uniPreluDFPHi_2x8b, shift0);
+
+            status = vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b);
+            status |= vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b);
+            CHECK_STATUS_FAIL_GOTO(status, final);
+        }
+        break;
+    case _PACK_SELECT_KEY(I8, I8, 1, 1, 1):
+    case _PACK_SELECT_KEY(I16, I16, 1, 1, 1):
+        {
+            gpu_dp_inst_t uniPreluInt8_2x8 = { {
+                0x55555555, // TCfg
+                0x00000000, // ASelt
+                0xb3a29180, 0xf7e6d5c4, // ABin
+                0x66666666, // BSelt
+                0x30201000, 0x70605040, // BBin
+                0x00000600, // AccumType, ConstantType, and PostShift
+                0x00000001, 0x00000001, 0x00000001, 0x00000001,
+                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+            }, GPU_DP_TYPE_16 };
+            gpu_dp_inst_t uniPreluInt16_part0_4x4 = { {
+                0x05050505, // TCfg
+                0x00000000, // ASelt
+                0x00510040, 0x00730062, // ABin
+                0x06060606, // BSelt
+                0x00100000, 0x00300020, // BBin
+                0x00000400, // AccumType, ConstantType, and PostShift
+                0x00000001, 0x00000000, 0x00000001, 0x00000000,
+                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+            gpu_dp_inst_t uniPreluInt16_part1_4x4 = { {
+                0x05050505, // TCfg
+                0x00000000, // ASelt
+                0x00510040, 0x00730062, // ABin
+                0x06060606, // BSelt
+                0x00500040, 0x00700060, // BBin
+                0x00000400, // AccumType, ConstantType, and PostShift
+                0x00000001, 0x00000000, 0x00000001, 0x00000000,
+                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+
+            gpu_dp_inst_update_postshfit(&uniPreluInt8_2x8, shift0);
+            gpu_dp_inst_update_postshfit(&uniPreluInt16_part0_4x4, shift0);
+            gpu_dp_inst_update_postshfit(&uniPreluInt16_part1_4x4, shift0);
+
+            status = vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluInt8_2x8", &uniPreluInt8_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4);
+            status |= vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4);
+            CHECK_STATUS_FAIL_GOTO(status, final);
+        }
+        break;
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1):
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2):
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1):
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2):
     {
         gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
             0x11111111, // TCfg

@@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
         status |= vsi_nn_kernel_gpu_add_param( node,
             "uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 );
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "inputZP0", &inputZP0 );
+            "input0_zp", &input0_zp);
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "input_scale0", &input_scale0 );
+            "input0_scale", &input0_scale );
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "inputZP1", &inputZP1 );
+            "input1_zp", &input1_zp);
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "input_scale1", &input_scale1 );
+            "input1_scale", &input1_scale );
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "outputZP", &outputZP );
+            "output_zp", &output_zp );
         if (attr[2]->dtype == F16)
         {
             status |= vsi_nn_kernel_gpu_add_param( node,
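
On the merged DFP branch above: shift0 and is_ge_fl are now derived only when both input and output use dynamic fixed point, which is the only case where the post-shift trick applies. For DFP, x_real = x_q * 2^-fl, so requantizing from fraction length fl_in to fl_out is a plain right shift by shift0 = fl_in - fl_out whenever shift0 >= 0, which is what gpu_dp_inst_update_postshfit folds into the DP instruction. A small numeric check with illustrative fraction lengths:

    /* Hedged numeric check of the DFP post-shift relation used above. */
    #include <assert.h>

    int main(void)
    {
        int in_fl = 7, out_fl = 5;       /* illustrative fraction lengths */
        int shift0 = in_fl - out_fl;     /* 2: is_ge_fl, handled by postshift */
        int x_q_in = 192;                /* represents 192 * 2^-7 = 1.5 */
        int x_q_out = x_q_in >> shift0;  /* 48 -> 48 * 2^-5 = 1.5, same value */
        assert(x_q_out == 48);
        return 0;
    }
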
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -58,53 +59,92 @@ typedef enum
 #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
 #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
 #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
+#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3"
+#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4"
+#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5"

 #define STR(a) #a
 // Add kernel hashtable here
-#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \
-        (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag))
+#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \
+        (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22))

-#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \
+#define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \
+#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
+        _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \
+        _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE )
+
+#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \
+#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
+        _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \
+        _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE )
+
+#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \
+#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
+        _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \
+        _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE )
+
+#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_2x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \
+#define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_4x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \
+#define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_8x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \
+#define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_3x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

+#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_2x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
+
+#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_4x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
+
+#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_8x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) }
+
+#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_3x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) }
+
 #define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
|
||||||
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \
|
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \
|
||||||
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
|
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
|
||||||
"_SAME_8x_upsample_align_corners"), \
|
"_SAME_8x_upsample_align_corners"), \
|
||||||
"resize_bilinear_align_corners" }
|
"resize_bilinear_align_corners" }
|
||||||
|
|
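The hash key now carries a fourth field, so a quantization-preserving ("same type") and a requantizing variant of the same scale flag can coexist in the kernel map; the PACK_KERNEL_MAP_* wrappers simply register both. A minimal standalone sketch of the packing, with hypothetical enum values (the real dtype ids and scale flags live in ovxlib):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical ids for illustration only. */
enum { DT_U8 = 5, FLAG_UP_2X_HALF = 6 };

static uint32_t resize_bilinear_hash_key(uint32_t in_dtype, uint32_t out_dtype,
                                         uint32_t scale_flag, uint32_t same_type)
{
    /* dtypes in bits 0..7 and 8..15, scale flag in bits 16..21, same_type in bit 22 */
    return in_dtype | (out_dtype << 8) | (scale_flag << 16) | (same_type << 22);
}

int main(void)
{
    /* The same dtype/flag pair yields two distinct keys, so the map can hold
     * a same-quantization kernel and a requantizing kernel side by side. */
    printf("same_type=1 -> 0x%08x\n", resize_bilinear_hash_key(DT_U8, DT_U8, FLAG_UP_2X_HALF, 1));
    printf("same_type=0 -> 0x%08x\n", resize_bilinear_hash_key(DT_U8, DT_U8, FLAG_UP_2X_HALF, 0));
    return 0;
}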
@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
    PACK_KERNEL_MAP_UP(F16, F16),
    PACK_KERNEL_MAP_UP(BF16, BF16),
    PACK_KERNEL_MAP_UP_OPT(U8, U8),
    PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
    PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
    PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
@ -672,18 +716,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
    };
    vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
    vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
    vsi_size_array_t * out_shape = NULL;
    vsi_size_array_t * in_shape = NULL;
    vsi_nn_kernel_dtype_e input_dtype = F16;
    vsi_nn_kernel_dtype_e output_dtype = F16;
    uint32_t depth = 0;
    uint32_t in_width = 0;
    uint32_t in_height = 0;
    uint32_t out_width = 0;
    uint32_t out_height = 0;
    vsi_bool is_same_type = FALSE;
    vsi_bool is_2x_up_kernel = FALSE;
    vsi_bool is_3x_up_kernel = FALSE;
    vsi_bool is_4x_up_kernel = FALSE;
    vsi_bool is_8x_up_kernel = FALSE;
    float scale = 1.f;
    int32_t input_zp = 0;
    int32_t output_zp = 0;

    VSI_UNREFERENCED(param_size);
@ -692,17 +741,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

    out_shape = output_attr->shape;
    in_shape = input_attr->shape;
    input_dtype = input_attr->dtype;
    output_dtype = output_attr->dtype;

    in_width = (uint32_t)(in_shape->data[0]);
    in_height = (uint32_t)(in_shape->data[1]);
    depth = (uint32_t)(in_shape->data[2]);
    out_width = (uint32_t)(out_shape->data[0]);
    out_height = (uint32_t)(out_shape->data[1]);
    scale = input_attr->scale;
    input_zp = input_attr->zero_point;
    scale /= output_attr->scale;
    output_zp = output_attr->zero_point;
    is_same_type = _is_same_quant(input_attr, output_attr);

    if ((U8 == input_dtype) && (output_dtype == U8))
    {
        is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
        is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
@ -728,206 +783,303 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
        gpu_param.global_scale[2] = 1;
    }

    if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
    {
        uint16_t M0 = 0;
        int32_t postShift = 0;
        uint32_t multAndoutZP[2] = { 0 };
        gpu_dp_inst_t uniU8PostProcess_2x8 = { {
            0xdddddddd, // TCfg
            0x44444444, // ASelt
            0x13121110, 0x17161514, // ABin
            0x11111111, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        if (is_2x_up_kernel)
        {
            gpu_dp_inst_t uniResize2xUp_0_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
                0x00000704, // AccumType, ConstantType, and PostShift
                0x09030301, 0x03090103, 0x09030301, 0x03090103,
                0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize2xUp_1_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
                0x00000704, // AccumType, ConstantType, and PostShift
                0x09030301, 0x03090103, 0x09030301, 0x03090103,
                0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 16.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize2xUp_0_4x8.data[7] = 0x00000700;
                uniResize2xUp_1_4x8.data[7] = 0x00000700;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        else if (is_3x_up_kernel)
        {
            gpu_dp_inst_t uniResize3xUp_l00_2x8 = { {
                0x15515515, // TCfg
                0x00000000, // ASelt
                0x21210110, 0x03323202, // ABin
                0x2aa2aa2a, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000610, // AccumType, ConstantType, and PostShift
                0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
                0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l01_2x8 = { {
                0x05155155, // TCfg
                0x00000000, // ASelt
                0x54044343, 0x00650554, // ABin
                0x0a2aa2aa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000610, // AccumType, ConstantType, and PostShift
                0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
                0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l10_4x4 = { {
                0x55551155, // TCfg
                0x50501050, // ASelt
                0x01011010, 0x21212121, // ABin
                0xaaaa22aa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
                0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l11_4x4 = { {
                0x11555511, // TCfg
                0x10505010, // ASelt
                0x32320202, 0x03033232, // ABin
                0x22aaaa22, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
                0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l12_4x4 = { {
                0x55115555, // TCfg
                0x50105050, // ASelt
                0x43434343, 0x54540404, // ABin
                0xaa22aaaa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
                0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l13_4x4 = { {
                0x00551155, // TCfg
                0x00501050, // ASelt
                0x05055454, 0x00006565, // ABin
                0x00aa22aa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
                0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 256.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize3xUp_l00_2x8.data[7] = 0x00000608;
                uniResize3xUp_l01_2x8.data[7] = 0x00000608;
                uniResize3xUp_l10_4x4.data[7] = 0x00000607;
                uniResize3xUp_l11_4x4.data[7] = 0x00000607;
                uniResize3xUp_l12_4x4.data[7] = 0x00000607;
                uniResize3xUp_l13_4x4.data[7] = 0x00000607;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        else if (is_4x_up_kernel)
        {
            gpu_dp_inst_t uniResize4xUp_l00_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize4xUp_l01_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize4xUp_l10_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x23150503, 0x31070701, 0x07310107, 0x15230305,
                0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize4xUp_l11_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x23150503, 0x31070701, 0x07310107, 0x15230305,
                0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 64.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize4xUp_l00_4x8.data[7] = 0x00000400;
                uniResize4xUp_l01_4x8.data[7] = 0x00000400;
                uniResize4xUp_l10_4x8.data[7] = 0x00000400;
                uniResize4xUp_l11_4x8.data[7] = 0x00000400;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        else if (is_8x_up_kernel)
        {
            gpu_dp_inst_t uniResize8xUp_l00_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
                0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l01_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
                0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l10_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
                0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l11_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
                0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l20_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
                0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l21_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
                0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l30_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
                0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l31_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
                0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 256.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize8xUp_l00_4x8.data[7] = 0x00000700;
                uniResize8xUp_l01_4x8.data[7] = 0x00000700;
                uniResize8xUp_l10_4x8.data[7] = 0x00000700;
                uniResize8xUp_l11_4x8.data[7] = 0x00000700;
                uniResize8xUp_l20_4x8.data[7] = 0x00000700;
                uniResize8xUp_l21_4x8.data[7] = 0x00000700;
                uniResize8xUp_l30_4x8.data[7] = 0x00000700;
                uniResize8xUp_l31_4x8.data[7] = 0x00000700;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
    }
    else
    {
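When input and output quantization differ (!is_same_type), the initializer above folds scale = input_scale / output_scale into a 16-bit multiply plus right shift and hands the shader a fused multiplier/zero-point pair. A standalone sketch of that arithmetic; quantize_multiplier_16bit() below is a stand-in for ovxlib's gpu_quantize_multiplier_16bit(), whose exact rounding is an assumption:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Pick a 16-bit multiplier M0 and a right shift so M0 / 2^postShift ~= s. */
static void quantize_multiplier_16bit(double s, uint16_t *M0, int32_t *postShift)
{
    int exp = 0;
    double frac = frexp(s, &exp);          /* s == frac * 2^exp, frac in [0.5, 1) */
    *M0 = (uint16_t)lround(frac * (double)(1 << 15));
    *postShift = 15 - exp;
}

/* One requantized value: y = (x - in_zp) * s + out_zp in integer form. The
 * multAndoutZP pair in the code above folds the two zero-point terms into a
 * single constant added before the final shift. */
static int32_t requantize(int32_t x, int32_t in_zp, int32_t out_zp,
                          uint16_t M0, int32_t postShift)
{
    int64_t acc = (int64_t)(x - in_zp) * M0;
    return (int32_t)(acc >> postShift) + out_zp;
}

int main(void)
{
    uint16_t M0 = 0;
    int32_t postShift = 0;
    quantize_multiplier_16bit(0.5 / 16.0, &M0, &postShift);  /* scale / f2i_radio */
    printf("M0=%u postShift=%d y=%d\n", (unsigned)M0, (int)postShift,
           (int)requantize(200, 128, 0, M0, postShift));
    return 0;
}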
@ -1193,22 +1345,22 @@ static vsi_status _query_kernel

    if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
    {
        if ((!align_corners) && (half_pixel_centers) && is_2x_upsample)
        {
            scale_flag = UP_2X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample)
        {
            scale_flag = UP_3X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample)
        {
            scale_flag = UP_4X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample)
        {
            scale_flag = UP_8X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
@ -1232,7 +1384,7 @@ static vsi_status _query_kernel
            scale_flag = DOWN;
        }

    key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
    for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if( kernel_map[i].key == key )
@ -1244,7 +1396,7 @@ static vsi_status _query_kernel
    if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
    {
        scale_flag = UP;
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
        for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
        {
            if( kernel_map[i].key == key )
@ -1257,7 +1409,7 @@ static vsi_status _query_kernel
    if ((UP == scale_flag) && (i >= kernel_map_size))
    {
        scale_flag = DOWN;
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
        for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
        {
            if( kernel_map[i].key == key )
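The hunks above stage the lookup: a miss with the UP_OPT key (the scan runs off the end of the table) retries with UP, then DOWN, each time repacking the key with the same is_same_type bit. A minimal sketch of that scan-and-retry pattern, with hypothetical keys:

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t key; const char *name; } kmap_entry_t;

/* Linear scan as in the loops above; index == count means "not found". */
static uint32_t find_kernel(const kmap_entry_t *map, uint32_t n, uint32_t key)
{
    uint32_t i;
    for (i = 0; i < n; i++)
    {
        if (map[i].key == key) break;
    }
    return i;
}

int main(void)
{
    /* Hypothetical keys standing in for the UP_OPT / UP variants of one dtype. */
    kmap_entry_t map[] = { { 0x2u, "resize_bilinear_up" } };
    uint32_t n = 1;
    uint32_t i = find_kernel(map, n, 0x3u /* UP_OPT key */);
    if (i >= n)
    {
        i = find_kernel(map, n, 0x2u /* fall back to the plain UP key */);
    }
    printf("%s\n", map[i].name);
    return 0;
}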
@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16
    size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype);
    vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8;

    if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        return FALSE;
    }
@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup
    int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
    int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
    vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
    vsi_bool is_evis2 = \
        (vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2);
    vsi_bool is_run_opt_kernel = FALSE;
    vsi_nn_tensor_t* scale = NULL;
    int32_t pad_left = half_pixel_centers ? 1 : 0;
@ -0,0 +1,744 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 B---batch
 N---num_heads
 S---sequence length
 H---head size
 */
typedef enum
{
    LAYOUT_NONE,
    LAYOUT_BNHS,
    LAYOUT_BNH1,
    LAYOUT_BSNH,
    LAYOUT_BNSH,
} _internal_rope_layout_e;

// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \
        ((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28))
#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \
        "rope_0" }
#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \
        "rope_1" }

#define PACK_KERNEL_BSNH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \
        "rope_2" }

#define PACK_KERNEL_BNSH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \
        "rope_3" }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
        PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BSNH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BNSH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE),
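/* ROPE_HASH_KEY packs the three dtypes into bits 0..23, the layout id into
 * bits 24..27 and the interleaved flag into bits 28..31; PACK_KERNEL_MAP
 * registers all four layout variants for one dtype triple at once, so e.g.
 * a (U8, F16, U8) interleaved BSNH query resolves to the "..._bsnh" kernel
 * built from source "rope_2". */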
static const _kernel_map_type _rope_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( BF16, BF16, BF16)
    PACK_KERNEL_MAP( F16, F16, F16 )
    PACK_KERNEL_MAP( I16, I16, I16 )
    PACK_KERNEL_MAP( I16, F16, I16 )
    PACK_KERNEL_MAP( I16, I16, I8 )
    PACK_KERNEL_MAP( I16, F16, I8 )
    PACK_KERNEL_MAP( I16, I16, U8 )
    PACK_KERNEL_MAP( I16, F16, U8 )
    PACK_KERNEL_MAP( U16, U16, U16 )
    PACK_KERNEL_MAP( U16, F16, U16 )
    PACK_KERNEL_MAP( I8, I8, I8 )
    PACK_KERNEL_MAP( I8, F16, I8 )
    PACK_KERNEL_MAP( U8, U8, U8 )
    PACK_KERNEL_MAP( U8, F16, U8 )
};

/*
 * Kernel params
 */
static vx_param_description_t _rope_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM  _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_rope_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t* out_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in0_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in1_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in2_attr = NULL;
    vsi_size_array_t* in_shape = NULL;
    vsi_nn_kernel_dtype_e in0_dtype = F16;
    vsi_nn_kernel_dtype_e in1_dtype = F16;
    vsi_nn_kernel_dtype_e in2_dtype = F16;
    vsi_nn_kernel_dtype_e out_dtype = F16;
    float in0_scale = 1.0f;
    float in1_scale = 1.0f;
    float in2_scale = 1.0f;
    float output_scale = 1.0f;
    float output_zp = 0;
    int32_t in0_zp = 0;
    int32_t cos_zp = 0;
    int32_t sin_zp = 0;
    int32_t p = 0;
    int32_t axis = 0;
    int32_t interleaved = 0;
    int32_t half_head_size = 1;
    vsi_size_t shape[3] = {1};
    uint32_t pack_key = 0;

    VSI_UNREFERENCED(node);
    VSI_UNREFERENCED(param);
    VSI_UNREFERENCED(param_size);
    // Add initializer

    in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
    CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final);
    in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
    CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final);
    in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
    CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final);
    out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]);
    CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final);

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p);
    CHECK_STATUS_FAIL_GOTO(status, final);

    axis = p & 0xFFFF;
    interleaved = (p >> 16) & 0xFFFF;
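    /* The scalar at SCALAR_AXIS packs two fields: the rotation axis in the
     * low 16 bits and the interleaved flag in the high 16 bits, i.e. the
     * caller passes p = (interleaved << 16) | axis, as the two extractions
     * above imply. */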
    in_shape = in0_attr->shape;
    in0_dtype = in0_attr->dtype;
    in1_dtype = in1_attr->dtype;
    in2_dtype = in2_attr->dtype;
    out_dtype = out_attr->dtype;

    in0_scale = in0_attr->scale;
    in1_scale = in1_attr->scale;
    in2_scale = in2_attr->scale;
    in0_zp = -in0_attr->zero_point;
    cos_zp = -in1_attr->zero_point;
    sin_zp = -in2_attr->zero_point;
    output_scale = out_attr->scale;
    output_zp = (float)out_attr->zero_point;

    half_head_size = (int32_t)(in_shape->data[axis] / 2);
    shape[0] = in_shape->data[0];
    shape[1] = in_shape->data[1];
    shape[2] = in_shape->data[2];
    shape[axis] = half_head_size;

    gpu_param.global_scale[0] = 8;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = gpu_align_p2((shape[0] + \
            gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = shape[1];
    gpu_param.global_size[2] = shape[2];

#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
        ((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24))

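    /* _PACK_SELECT_KEY collapses the four dtypes into one word so the switch
     * below can branch on an exact (in0, cos, sin, out) combination, e.g.
     * _PACK_SELECT_KEY(U8, F16, F16, U8) for u8 data with f16 cos/sin tables. */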
    pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype);
    switch (pack_key)
    {
    case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
        {
            gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { {
                0x11111111, // TCfg
                0x01010101, // ASelt
                0x01050004, 0x03070206, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { {
                0x11111111, // TCfg
                0x01010101, // ASelt
                0x05050404, 0x07070606, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniExtractOddData_2x8 = { {
                0x11111111, // TCfg
                0x11110000, // ASelt
                0x07050301, 0x07050301, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };

            if (interleaved && axis == 0)
            {
                uniExtractOddData_2x8.data[1] = 0x10101010;
                uniExtractOddData_2x8.data[2] = 0x03030101;
                uniExtractOddData_2x8.data[3] = 0x07070505;
            }
            else
            {
                status = vsi_nn_kernel_gpu_add_param(node,
                        "half_head_size", &half_head_size);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }
            status = vsi_nn_kernel_gpu_add_param(node,
                    "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniExtractOddData_2x8", &uniExtractOddData_2x8);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        break;
    case _PACK_SELECT_KEY(I16, I16, I16, I16):
    case _PACK_SELECT_KEY(I16, F16, F16, I16):
    case _PACK_SELECT_KEY(I16, I16, I16, I8):
    case _PACK_SELECT_KEY(I16, F16, F16, I8):
    case _PACK_SELECT_KEY(I16, I16, I16, U8):
    case _PACK_SELECT_KEY(I16, F16, F16, U8):
    case _PACK_SELECT_KEY(F16, F16, F16, F16):
        {
            float scale0 = in0_scale * in1_scale / output_scale;
            float scale1 = in0_scale * in2_scale / output_scale;
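            /* Folded requantization factors: the kernel evaluates
             * y = x*cos*scale0 +/- rotate_half(x)*sin*scale1, so each product
             * of two input scales is normalized once by the output scale. */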
            gpu_dp_inst_t uniExtractHalf8_2x8 = { {
                0x11111111, // TCfg
                0x11110000, // ASelt
                0x06040200, 0x06040200, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000100, // AccumType, ConstantType, and PostShift
                0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
                0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniExtractInteger_2x8 = { {
                0x33333333, // TCfg
                0x11110000, // ASelt
                0x03020100, 0x03020100, // ABin
                0x00000000, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniATimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00010000, 0x00030002, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniATimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00050004, 0x00070006, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00020000, 0x00060004, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00020000, 0x00060004, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAOddTimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00030001, 0x00070005, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAOddTimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00030001, 0x00070005, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            if (interleaved && axis == 0)
            {
                uniExtractHalf8_2x8.data[1] = 0x10101010;
                uniExtractHalf8_2x8.data[2] = 0x02020000;
                uniExtractHalf8_2x8.data[3] = 0x06060404;
                uniExtractInteger_2x8.data[1] = 0x10101010;
                uniExtractInteger_2x8.data[2] = 0x01010000;
                uniExtractInteger_2x8.data[3] = 0x03030202;

                status = vsi_nn_kernel_gpu_add_param(node,
                        "uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4);
            }
            else
            {
                status = vsi_nn_kernel_gpu_add_param(node,
                        "uniATimesB_0_4x4", &uniATimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniATimesB_1_4x4", &uniATimesB_1_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "half_head_size", &half_head_size);
            }
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale0", &scale0);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale1", &scale1);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "output_zp", &output_zp);
            if (out_dtype == F16)
            {
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
            }
            else
            {
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniExtract8Data_2x8", &uniExtractInteger_2x8);
            }
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        break;
    case _PACK_SELECT_KEY(I8, I8, I8, I8):
    case _PACK_SELECT_KEY(U8, U8, U8, U8):
    case _PACK_SELECT_KEY(U16, U16, U16, U16):
    case _PACK_SELECT_KEY(I8, F16, F16, I8):
    case _PACK_SELECT_KEY(U8, F16, F16, U8):
    case _PACK_SELECT_KEY(U16, F16, F16, U16):
        {
            float scale0 = in0_scale * in1_scale / output_scale;
            float scale1 = in0_scale * in2_scale / output_scale;
            gpu_dp_inst_t uniExtractInteger_2x8 = { {
                0x33333333, // TCfg
                0x11110000, // ASelt
                0x03020100, 0x03020100, // ABin
                0x00000000, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAMinusZp_0_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00010000, 0x00030002, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAMinusZp_1_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00050004, 0x00070006, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenMinusZp_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00020000, 0x00060004, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
|
gpu_dp_inst_t uniAOddMinusZp_4x4 = { {
|
||||||
|
0x0d0d0d0d, // TCfg
|
||||||
|
0x04040404, // ASelt
|
||||||
|
0x00030001, 0x00070005, // ABin
|
||||||
|
0x02020202, // BSelt
|
||||||
|
0x00000000, 0x00000000, // BBin
|
||||||
|
0x00002400, // AccumType, ConstantType, and PostShift
|
||||||
|
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
||||||
|
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
||||||
|
}, GPU_DP_TYPE_16 };
|
||||||
|
|
||||||
|
if (interleaved && axis == 0)
|
||||||
|
{
|
||||||
|
uniExtractInteger_2x8.data[1] = 0x10101010;
|
||||||
|
uniExtractInteger_2x8.data[2] = 0x01010000;
|
||||||
|
uniExtractInteger_2x8.data[3] = 0x03030202;
|
||||||
|
|
||||||
|
status = vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
status = vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"half_head_size", &half_head_size);
|
||||||
|
}
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"scale0", &scale0);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"scale1", &scale1);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"output_zp", &output_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"in0_zp", &in0_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"cos_zp", &cos_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"sin_zp", &sin_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
|
||||||
|
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
|
||||||
|
final:
|
||||||
|
if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr);
|
||||||
|
if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr);
|
||||||
|
if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr);
|
||||||
|
if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr);
|
||||||
|
return status;
|
||||||
|
} /* _rope_initializer() */
|
||||||
|
|
||||||
|
/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t axis,
    int32_t interleaved,
    _internal_rope_layout_e *layout
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in0_dtype;
    vsi_nn_kernel_dtype_e in1_dtype;
    vsi_nn_kernel_dtype_e in2_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
    int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]);
    int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]);
    const _kernel_map_type * kernel_map = _rope_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
    vx_param_description_t * param_def = _rope_kernel_param_def;
    vx_kernel_initialize_f initializer = _rope_initializer;

    uint32_t key;
    uint32_t i;

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    /* Only symmetric int16 is supported: reject I16 combinations with a nonzero zero point. */
    if ( ( (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I16) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) ||
           (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) ||
           (in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) &&
         (in0_zp != 0 || in1_zp != 0 || in2_zp != 0))
    {
        return VSI_FAILURE;
    }

    if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] &&
        in1_dtype == in2_dtype)
    {
        if (inputs[0]->attr.size[0] == 1)
        {
            *layout = LAYOUT_BNH1;
        }
        else
        {
            *layout = LAYOUT_BNHS;
        }
    }
    else if (axis == 0 && in1_dtype == in2_dtype)
    {
        if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] &&
            inputs[1]->attr.size[1] == 1)
        {
            *layout = LAYOUT_BSNH;
        }
        else
        {
            *layout = LAYOUT_BNSH;
        }
    }

    key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved);

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "vsi_nn_kernel_header",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    int32_t axis = 0;
    int32_t i = 0;
    int32_t interleaved = 0;
    int32_t param = 0;
    vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_nn_tensor_t* rs_tensors[4] = { NULL };
    vsi_nn_tensor_t* reshape_tensors[4] = { NULL };
    _internal_rope_layout_e layout = LAYOUT_NONE;

    VSI_UNREFERENCED(params);

    axis = vsi_nn_kernel_param_get_int32(params, "axis");
    interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");

    // Check if gpu can support the size
    if ( !vsi_nn_kernel_gpu_check_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout );
    if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH)
    {
        memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
        memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
        memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));

        if (outputs[0]->attr.size[0] == 1)
        {
            for (i = 1; i < 3; i++)
            {
                shape[0][i - 1] = shape[0][i];
                shape[1][i - 1] = shape[1][i];
                shape[2][i - 1] = shape[2][i];
            }
            shape[0][2] = 1;
            shape[1][2] = 1;
            shape[2][2] = 1;
        }
        else
        {
            int32_t j = 0;
            for (i = 0; i < 3; i++)
            {
                if (shape[1][i] != 1)
                {
                    shape[1][j] = shape[1][i];
                    j ++;
                }
            }
            for (; j < 3; j++)
            {
                shape[1][j] = 1;
            }
        }

        rs_tensors[0] = vsi_nn_reshape_tensor(graph,
            inputs[0], shape[0], inputs[0]->attr.dim_num);
        rs_tensors[1] = vsi_nn_reshape_tensor(graph,
            inputs[1], shape[1], inputs[1]->attr.dim_num);
        rs_tensors[2] = vsi_nn_reshape_tensor(graph,
            inputs[2], shape[1], inputs[2]->attr.dim_num);
        rs_tensors[3] = vsi_nn_reshape_tensor(graph,
            outputs[0], shape[2], outputs[0]->attr.dim_num);

        if (outputs[0]->attr.size[0] == 1 && axis > 0)
        {
            axis--;
        }
        reshape_tensors[0] = rs_tensors[0];
        reshape_tensors[1] = rs_tensors[1];
        reshape_tensors[2] = rs_tensors[2];
        reshape_tensors[3] = rs_tensors[3];
    }
    else
    {
        reshape_tensors[0] = inputs[0];
        reshape_tensors[1] = inputs[1];
        reshape_tensors[2] = inputs[2];
        reshape_tensors[3] = outputs[0];
    }

    param = (interleaved << 16) | axis;
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
                    reshape_tensors, input_num, &reshape_tensors[3], output_num );
            /* Pass parameters to node. */
            node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &param);
            status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
            vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]);
        }
    }

    for (i = 0; i < 4; i++)
    {
        vsi_safe_release_tensor(rs_tensors[i]);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( rope, _setup )

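For orientation: the uniAEvenTimesB/uniAOddTimesB tables above gather the even lanes (0,2,4,6) and odd lanes (1,3,5,7) of the input when interleaved rotation is requested, while the uniATimesB tables multiply lane-for-lane and pass half_head_size separately so the shader can pair each element with its rotation partner in the other half of the head. A minimal scalar sketch of the two RoPE pairing schemes (illustrative C only, not code from this diff; both function names are made up):

    /* Half-split pairing: element i rotates with element i + half (non-interleaved path). */
    static void rope_rotate_halved(const float* x, const float* cosv, const float* sinv,
                                   float* y, size_t half)
    {
        for (size_t i = 0; i < half; i++)
        {
            y[i]        = x[i] * cosv[i]        - x[i + half] * sinv[i];
            y[i + half] = x[i + half] * cosv[i] + x[i] * sinv[i];
        }
    }

    /* Interleaved pairing: lanes (2i, 2i+1) rotate together (the interleaved && axis == 0 path). */
    static void rope_rotate_interleaved(const float* x, const float* cosv, const float* sinv,
                                        float* y, size_t head)
    {
        for (size_t i = 0; i < head / 2; i++)
        {
            y[2 * i]     = x[2 * i] * cosv[i]     - x[2 * i + 1] * sinv[i];
            y[2 * i + 1] = x[2 * i + 1] * cosv[i] + x[2 * i] * sinv[i];
        }
    }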
@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};

static const _kernel_map_type scatter_nd_update_special_update_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};

static const _kernel_map_type scatter_nd_update_special_copy_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
};

/*
@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
    {
        case _PACK_SELECT_KEY( I8, I8 ):
        case _PACK_SELECT_KEY( U8, U8 ):
+       case _PACK_SELECT_KEY( I16, I16 ):
+       case _PACK_SELECT_KEY( U16, U16 ):
        {
            uint16_t M0 = 0;
            int32_t postShift0 = 0;

@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
+       case _PACK_SELECT_KEY( F16, F16 ):
+           break;
        default:
            break;
    }

@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
    {
        case _PACK_SELECT_KEY( I8, I8 ):
        case _PACK_SELECT_KEY( U8, U8 ):
+       case _PACK_SELECT_KEY( I16, I16 ):
+       case _PACK_SELECT_KEY( U16, U16 ):
        {
            uint16_t M1 = 0;
            int32_t postShift1 = 0;

@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
+       case _PACK_SELECT_KEY( F16, F16 ):
+           break;
        default:
            break;
    }

@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special
        status |= VSI_FAILURE;
    }

+   if (input0_dtype == F16)
+   {
+       input0_dtype = U16;
+   }
+   if (input2_dtype == F16)
+   {
+       input2_dtype = U16;
+   }
+   if (output_dtype == F16)
+   {
+       output_dtype = U16;
+   }
+
    key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0);

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ )

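A plausible reading of the F16-to-U16 reassignments above: the special scatter_nd_update path moves 16-bit values bit-for-bit without interpreting them, so an F16 tensor can be serviced by the U16 table entry and no dedicated F16 copy kernels are needed. A one-line sketch of that normalization (illustrative C; the enum and helper name are made up, only the F16-to-U16 folding comes from the diff):

    typedef enum { U8, I8, U16, I16, F16 } dtype_e;

    /* Fold 16-bit float onto the 16-bit unsigned entry before the table lookup,
     * since a bit-exact 16-bit move is type-agnostic. */
    static dtype_e normalize_for_lookup(dtype_e t)
    {
        return (t == F16) ? U16 : t;
    }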
@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"

@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
-   if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
+   if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        return NULL;
    }

@ -548,16 +548,16 @@ static vsi_status _gpu_register
    vsi_status status;
    vx_kernel_description_t* info;
    vx_kernel obj;
-   vsi_nn_context_t context;
    vx_program program = NULL;
    const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
+   vsi_nn_runtime_option_t* options;
+   options = ((vsi_nn_graph_prv_t*)graph)->options;

#define MAX_BUILDPROGRAM_LEN 1024
    char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
    size_t cost_bytes = 0;

    memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
-   context = graph->ctx;

    status = VSI_FAILURE;
    info = &(kernel->info);

@ -579,21 +579,21 @@ static vsi_status _gpu_register
        return status;
    }

-   if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
+   if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
    {
        // set default evis version is 2
        if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
        {
            cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                    "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
-                   context->config.use_40bits_va );
+                   options->config.use_40bits_va );
        }
    }
    else
    {
        cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
-               context->config.evis.ver, context->config.use_40bits_va );
+               options->config.evis.ver, options->config.use_40bits_va );
    }
    // Pack build option
    if( kernel->gpu.sources[active_fmt].build_option.data )

@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext
    vsi_status status;
    vx_kernel_description_t* info;
    vx_kernel obj;
-   vsi_nn_context_t context;
    vx_program program = NULL;
    const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
+   vsi_nn_runtime_option_t* options;
+   options = ((vsi_nn_graph_prv_t*)graph)->options;

#define MAX_BUILDPROGRAM_LEN 1024
    char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
    size_t cost_bytes = 0;

    memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
-   context = graph->ctx;

    status = VSI_FAILURE;
    info = &(kernel->info);

@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext
        return status;
    }

-   if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
+   if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
    {
        // set default evis version is 2
        if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
        {
            cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                    "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
-                   context->config.use_40bits_va );
+                   options->config.use_40bits_va );
        }
    }
    else
    {
        cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
-               context->config.evis.ver, context->config.use_40bits_va );
+               options->config.evis.ver, options->config.use_40bits_va );
    }
    // Pack build option
    if( kernel->gpu.sources[active_fmt].build_option.data )

@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
    }
    /* Skip evis if not support */
    if( type == VSI_NN_KERNEL_TYPE_EVIS
-       && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE )
+       && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE )
    {
        continue;
    }

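Both _gpu_register and _gpu_register_ext now read the EVIS version and 40-bit-VA flag from the per-graph runtime options rather than the shared context, but the build command they assemble is unchanged. A standalone sketch of the flag composition (illustrative C with made-up values; only snprintf and the flag text come from the code above):

    #include <stdio.h>

    int main(void)
    {
        char cmd[1024] = { 0 };
        int evis_ver = 2;        /* e.g. VSI_NN_HW_EVIS_2 */
        int use_40bits_va = 0;   /* options->config.use_40bits_va */
        snprintf(cmd, sizeof(cmd),
                 "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
                 evis_ver, use_40bits_va);
        printf("%s\n", cmd);     /* handed to the OpenCL program build step */
        return 0;
    }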
@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
    int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;

#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
-   if ( graph->ctx->config.subGroupSize == 0 )
+   if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 )
    {
        return FALSE;
    }

@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif
-#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
-#endif
#if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif
-#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm)
-#endif
#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(exp)
#endif

@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif
+REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm)

__END_DECLS

@ -0,0 +1,89 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if VX_GROUP_NORMALIZATION_VX_SUPPORT
#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        )

REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm)
{
    vx_node node = NULL;
    float eps = vsi_nn_kernel_param_get_float32(params, "eps");
    int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num");
    vx_tensor inputs_tensor[3] = { NULL };
    vx_tensor output_tensor = NULL;

    inputs_tensor[0] = inputs[0]->t;
    inputs_tensor[1] = inputs[1]->t;
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(kernel);

    if (graph->ctx->config.support_ffd ||
        graph->ctx->config.support_stream_processor)
    {
        node = vxGroupNormalizationLayer(
            graph->g,
            eps,
            group_num,
            inputs_tensor,
            (vx_uint32)input_num,
            output_tensor
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* group_norm() */

#endif

@ -0,0 +1,87 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT
#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        )

REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm)
{
    vsi_nn_kernel_node_t node = NULL;
    float eps = vsi_nn_kernel_param_get_float32(params, "eps");
    vx_tensor inputs_tensor[3] = { NULL };
    vx_tensor output_tensor = NULL;

    inputs_tensor[0] = inputs[0]->t;
    inputs_tensor[1] = inputs[1]->t;
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(kernel);

    if (graph->ctx->config.support_ffd ||
        graph->ctx->config.support_stream_processor)
    {
        node = vxInstanceNormalizationLayer(
            graph->g,
            eps,
            inputs_tensor,
            (vx_uint32)input_num,
            output_tensor
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* instance_norm() */

#endif

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

-#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
+#if (VX_LAYER_NORMALIZATION_VX_SUPPORT)
#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
    ( \

@ -71,14 +71,20 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm )
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

-   node = vxLayerNormalizationLayer(
-       graph->g,
-       eps,
-       axis,
-       inputs_tensor,
-       (uint32_t)input_num,
-       output_tensor
+#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
+   if (graph->ctx->config.support_ffd ||
+       graph->ctx->config.support_stream_processor)
+#endif
+   {
+       node = vxLayerNormalizationLayer(
+           graph->g,
+           eps,
+           axis,
+           inputs_tensor,
+           (uint32_t)input_num,
+           output_tensor
        );
+   }

    return (vsi_nn_kernel_node_t)node;
} /* layer_norm() */

@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
    if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
    {
        vsi_nn_tensor_attr_t attr;
+
        memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
        memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
-       attr.vtl = FALSE;
+       attr.vtl = TRUE;
        attr.is_const = FALSE;

        convert_tensor = vsi_nn_CreateTensor(graph, &attr);

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

-#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
+#if (VX_RELATIONAL_OPS_VX_SUPPORT)

#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \

@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
    VSI_UNREFERENCED(kernel);
    VSI_UNREFERENCED(output_num);

-   node = vxRelationalLayer(graph->g,
-       operation,
-       inputs_tensor,
-       (uint32_t)input_num,
-       outputs[0]->t
-       );
+#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
+   if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0]))
+   {
+       return NULL;
+   }
+#endif
+
+#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
+   if (graph->ctx->config.support_stream_processor)
+#endif
+   {
+       node = vxRelationalLayer(
+           graph->g,
+           operation,
+           inputs_tensor,
+           (uint32_t)input_num,
+           outputs[0]->t
+           );
+   }

    return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */

@ -23,6 +23,7 @@
*****************************************************************************/

#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"

@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish )
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(input_num);

-   if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
+   if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type");

@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2(
    }
}

-#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \
-__kernel void cumsum_##name##toU8_axis2( \
+#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis2( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
    int axis, \

@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \
    int4 coord_out = coord; \
    \
    src_type sum = (src_type)(0); \
-   uint4 dst = (uint4)(0); \
+   dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
-   dst.x = convert_uint_sat(tmp_zp); \
+   dst.x = convert_dtype(tmp_zp); \
    \
    float cnt = 0.0f; \
    \
    if(exclusive && rev) \
    { \
        coord_out.z = channel - 1; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        for(coord.z = channel - 1; coord.z > 0; coord.z--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            coord_out.z--; \
            cnt += 1.0f; \
            sum += data; \

@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        coord_out.z = 0; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        for(coord.z = 0; coord.z < channel - 1; coord.z++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            coord_out.z++; \
            cnt += 1.0f; \
            sum += data; \

@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
    else \
    { \
        for(coord.z = 0; coord.z < channel; coord.z++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
}
-CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)
-CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)
+CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

__kernel void cumsum_F32toF32_axis1(
    __read_only image2d_array_t input,

@ -233,10 +232,10 @@ __kernel void cumsum_F32toF32_axis1(
    }
}

-#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \
-__kernel void cumsum_##name##toU8_axis1( \
+#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis1( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
    int axis, \
    int exclusive, \
    int rev, \

@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \
    int4 coord_out = coord; \
    \
    src_type sum = (src_type)(0); \
-   uint4 dst = (uint4)(0); \
+   dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
-   dst.x = convert_uint_sat(tmp_zp); \
+   dst.x = convert_dtype(tmp_zp); \
    \
    float cnt = 0; \
    \
    if(exclusive && rev) \
    { \
        coord_out.y = height - 1; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        \
        for(coord.y = height - 1; coord.y > 0; coord.y--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            coord_out.y--; \
            sum += data; \

@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        coord_out.y = 0; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        for(coord.y = 0; coord.y < height - 1; coord.y++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            coord_out.y++; \
            sum += data; \

@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.y = height - 1; coord.y >= 0; coord.y--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
    else \
    { \
        for(coord.y = 0; coord.y < height; coord.y++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
}
-CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)
-CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)
+CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

__kernel void cumsum_F32toF32_axis0(
    __read_only image2d_array_t input,

@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \
|
#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
|
||||||
__kernel void cumsum_##name##toU8_axis0( \
|
__kernel void cumsum_##name##_axis0( \
|
||||||
__read_only image2d_array_t input, \
|
__read_only image2d_array_t input, \
|
||||||
__write_only image2d_array_t output, \
|
__write_only image2d_array_t output, \
|
||||||
int axis, \
|
int axis, \
|
||||||
|
|
@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \
|
||||||
int4 coord_out = coord; \
|
int4 coord_out = coord; \
|
||||||
\
|
\
|
||||||
src_type sum = (src_type)(0); \
|
src_type sum = (src_type)(0); \
|
||||||
uint4 dst = (uint4)(0); \
|
dst_type dst = (dst_type)(0); \
|
||||||
int tmp_zp = convert_int_rte(output_zp); \
|
int tmp_zp = convert_int_rte(output_zp); \
|
||||||
dst.x = convert_uint_sat(tmp_zp); \
|
dst.x = convert_dtype(tmp_zp); \
|
||||||
\
|
\
|
||||||
float cnt = 0; \
|
float cnt = 0; \
|
||||||
\
|
\
|
||||||
if(exclusive && rev) \
|
if(exclusive && rev) \
|
||||||
{ \
|
{ \
|
||||||
coord_out.x = width - 1; \
|
coord_out.x = width - 1; \
|
||||||
write_imageui(output, coord_out, dst); \
|
image_write(output, coord_out, dst); \
|
||||||
for(coord.x = width - 1; coord.x > 0; coord.x--) \
|
for(coord.x = width - 1; coord.x > 0; coord.x--) \
|
||||||
{ \
|
{ \
|
||||||
src_type data = read_image_type(input, coord); \
|
+            src_type data = image_read(input, coord); \
             coord_out.x--; \
             cnt += 1.0f; \
             sum += data; \
@@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord_out, dst); \
+            image_write(output, coord_out, dst); \
         } \
     } \
     else if(exclusive) \
@@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \
         write_imageui(output, coord_out, dst); \
         for(coord.x = 0; coord.x < width - 1; coord.x++) \
         { \
-            src_type data = read_image_type(input, coord); \
+            src_type data = image_read(input, coord); \
             coord_out.x++; \
             cnt += 1.0f; \
             sum += data; \
@@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord_out, dst); \
+            image_write(output, coord_out, dst); \
         } \
     } \
     else if(rev) \
     { \
         for(coord.x = width - 1; coord.x >= 0; coord.x--) \
         { \
-            src_type data = read_image_type(input, coord); \
+            src_type data = image_read(input, coord); \
             cnt += 1.0f; \
             sum += data; \
 \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord, dst); \
+            image_write(output, coord, dst); \
         } \
     } \
     else \
     { \
         for(coord.x = 0; coord.x < width; coord.x++) \
         { \
-            src_type data = read_image_type(input, coord); \
+            src_type data = image_read(input, coord); \
             cnt += 1.0f; \
             sum += data; \
 \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord, dst); \
+            image_write(output, coord, dst); \
         } \
     } \
 }
-CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)
+CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
-CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)
+CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
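The change above collapses the separate per-type cumsum kernels into one X-macro, instantiated once per (source type, reader, destination type, writer, converter) tuple, so the U8toU8, F32toU8 and new I32toI32 variants share a single body. A minimal sketch of the same code-generation pattern, using illustrative names (DEMO_CUMSUM, demo_cumsum_*) and a flat buffer instead of images:

    /* Sketch of the macro-stamping pattern, not the committed code.
     * Each expansion emits one typed, serial cumsum kernel. */
    #define DEMO_CUMSUM(name, src_type, dst_type, convert_dtype)        \
    __kernel void demo_cumsum_##name(__global const src_type *input,    \
                                     __global dst_type *output,         \
                                     int width)                         \
    {                                                                   \
        src_type sum = (src_type)0;                                     \
        for (int x = 0; x < width; x++)                                 \
        {                                                               \
            sum += input[x];                /* running inclusive sum */ \
            output[x] = convert_dtype(sum); /* per-type conversion   */ \
        }                                                               \
    }
    DEMO_CUMSUM(F32toI32, float, int, convert_int_sat_rte)
    DEMO_CUMSUM(I32toI32, int,   int, convert_int_sat)

Each instantiation line plays the same role as the CUMSUM_toINT_AXIS0_SH(...) lines above: one macro body, several concrete kernels.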
@@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D(
     }
 }
 
-__kernel void cumsum_U8toU8_axis1_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    uint4 sum = (uint4)(0);
-    uint4 dst = (uint4)(0);
-
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0;
-
-    if(exclusive && rev)
-    {
-        coord.w = height - 1;
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = height - 1; coord.y > 0; coord.y--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            coord.w--;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = 0; coord.y < height - 1; coord.y++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            coord.w++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.y = height - 1; coord.y >= 0; coord.y--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.y = 0; coord.y < height; coord.y++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
-
-__kernel void cumsum_F32toU8_axis1_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    float4 sum = (float4)(0);
-    uint4 dst = (uint4)(0);
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0;
-
-    if(exclusive && rev)
-    {
-        coord.w = height - 1;
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = height - 1; coord.y > 0; coord.y--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            coord.w--;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = 0; coord.y < height - 1; coord.y++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            coord.w++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.y = height - 1; coord.y >= 0; coord.y--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.y = 0; coord.y < height; coord.y++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
+#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis1_2D( \
+    __read_only  image2d_t input, \
+    __write_only image2d_t output, \
+    int axis, \
+    int exclusive, \
+    int rev, \
+    int width, \
+    int height, \
+    int chn, \
+    int input_zp, \
+    float in_out_scale, \
+    float in_out_zp_scale, \
+    float output_zp \
+    ) \
+{ \
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+    src_type sum = (src_type)(0); \
+    dst_type dst = (dst_type)(0); \
+    int tmp_zp = convert_int_rte(output_zp); \
+    dst.x = convert_dtype(tmp_zp); \
+ \
+    float cnt = 0; \
+ \
+    if(exclusive && rev) \
+    { \
+        coord.w = height - 1; \
+        image_write(output, coord.zw, dst); \
+        for(coord.y = height - 1; coord.y > 0; coord.y--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.w--; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(exclusive) \
+    { \
+        image_write(output, coord.zw, dst); \
+        for(coord.y = 0; coord.y < height - 1; coord.y++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.w++; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(rev) \
+    { \
+        for(coord.y = height - 1; coord.y >= 0; coord.y--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+    else \
+    { \
+        for(coord.y = 0; coord.y < height; coord.y++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+}
+CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
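For reference, the four branches in these kernels implement the usual cumsum variants: inclusive, exclusive (result shifted one position, with the quantized zero written first), reverse, and exclusive-plus-reverse. A scalar sketch of the semantics over one row, independent of the image plumbing (the function name is illustrative):

    /* Reference semantics of the four cumsum branches over one row. */
    void demo_cumsum_row(const float *in, float *out, int n,
                         int exclusive, int rev)
    {
        float sum = 0.0f;
        if (exclusive && rev) {      /* out[n-1] = 0, then suffix sums */
            out[n - 1] = 0.0f;
            for (int x = n - 1; x > 0; x--) { sum += in[x]; out[x - 1] = sum; }
        } else if (exclusive) {      /* out[0] = 0, then prefix sums   */
            out[0] = 0.0f;
            for (int x = 0; x < n - 1; x++) { sum += in[x]; out[x + 1] = sum; }
        } else if (rev) {            /* inclusive suffix sums          */
            for (int x = n - 1; x >= 0; x--) { sum += in[x]; out[x] = sum; }
        } else {                     /* inclusive prefix sums          */
            for (int x = 0; x < n; x++) { sum += in[x]; out[x] = sum; }
        }
    }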
 __kernel void cumsum_F32toF32_axis0_2D(
     __read_only  image2d_t input,
@@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D(
     }
 }
 
-__kernel void cumsum_U8toU8_axis0_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    uint4 sum = (uint4)(0);
-    uint4 dst = (uint4)(0);
-
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0.0f;
-    if(exclusive && rev)
-    {
-        coord.x = width - 1;
-        coord.z = coord.x;
-        write_imageui(output, coord.zw, dst);
-        for(; coord.x > 0; coord.x--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            coord.z--;
-            cnt += 1.0;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        coord.z = 0;
-        write_imageui(output, coord.zw, dst);
-        for(coord.x = 0; coord.x < width - 1; coord.x++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            coord.z++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.x = width - 1; coord.x >= 0; coord.x--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.x = 0; coord.x < width; coord.x++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
-
-__kernel void cumsum_F32toU8_axis0_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    float4 sum = (float4)(0);
-    uint4 dst = (uint4)(0);
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0.0f;
-    if(exclusive && rev)
-    {
-        coord.x = width - 1;
-        coord.z = coord.x;
-        write_imageui(output, coord.zw, dst);
-        for(; coord.x > 0; coord.x--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            coord.z--;
-            cnt += 1.0;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        coord.z = 0;
-        write_imageui(output, coord.zw, dst);
-        for(coord.x = 0; coord.x < width - 1; coord.x++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            coord.z++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.x = width - 1; coord.x >= 0; coord.x--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.x = 0; coord.x < width; coord.x++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
+#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis0_2D( \
+    __read_only  image2d_t input, \
+    __write_only image2d_t output, \
+    int axis, \
+    int exclusive, \
+    int rev, \
+    int width, \
+    int height, \
+    int chn, \
+    int input_zp, \
+    float in_out_scale, \
+    float in_out_zp_scale, \
+    float output_zp \
+    ) \
+{ \
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+    src_type sum = (src_type)(0); \
+    dst_type dst = (dst_type)(0); \
+ \
+    int tmp_zp = convert_int_rte(output_zp); \
+    dst.x = convert_dtype(tmp_zp); \
+ \
+    float cnt = 0.0f; \
+ \
+    if(exclusive && rev) \
+    { \
+        coord.x = width - 1; \
+        coord.z = coord.x; \
+        image_write(output, coord.zw, dst); \
+        for(; coord.x > 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            coord.z--; \
+            cnt += 1.0; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(exclusive) \
+    { \
+        coord.z = 0; \
+        image_write(output, coord.zw, dst); \
+        for(coord.x = 0; coord.x < width - 1; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.z++; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(rev) \
+    { \
+        for(coord.x = width - 1; coord.x >= 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+    else \
+    { \
+        for(coord.x = 0; coord.x < width; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+}
+CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
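The tmpSum/tmpAlpha rescale used throughout these kernels follows from accumulating raw quantized values and requantizing once per output. If inputs dequantize as x_i = s_in * (q_i - zp_in) and outputs quantize as q = x / s_out + zp_out, then after cnt accumulated elements q_out = (sum of q_i) * (s_in / s_out) + cnt * (-zp_in * s_in / s_out) + zp_out, which matches sum.x * in_out_scale + cnt * in_out_zp_scale + output_zp, assuming in_out_scale = s_in / s_out and in_out_zp_scale = -zp_in * s_in / s_out (the uniform names suggest this; the host-side setup is not shown in this diff). A sketch under that assumption:

    /* Requantize a running sum of cnt raw quantized values in one step.
     * Assumes in_out_scale = s_in/s_out and in_out_zp_scale =
     * -zp_in*s_in/s_out, which this diff does not itself confirm. */
    inline uint demo_requantize_sum(uint sum_q, float cnt,
                                    float in_out_scale,
                                    float in_out_zp_scale,
                                    float output_zp)
    {
        float tmpAlpha = cnt * in_out_zp_scale + output_zp; /* zp terms   */
        float tmpSum   = (float)sum_q * in_out_scale + tmpAlpha;
        return convert_uint_sat_rte(tmpSum);                /* round+clamp */
    }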
@@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
         coord.z ++;
     } while (coord.z < depth);
 }
+
+__kernel void one_hot_I32toBF16
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_array_t output,
+    int   depth,
+    uint  on_value,
+    uint  off_value,
+    float inputScale,
+    float inputTail
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+
+    int4 src = read_imagei(input, coord.xy);
+
+    int val = convert_int(convert_float(src.x) * inputScale - inputTail);
+    do
+    {
+        uint4 dst;
+        dst.x = val == coord.z ? on_value : off_value;
+
+        write_imageui(output, coord.xzyw, dst.xxxx);
+
+        coord.z ++;
+    } while (coord.z < depth);
+}
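Like the existing one_hot variants, the new kernel maps each input index to a vector along the depth axis: the index is first rescaled with inputScale/inputTail, then on_value is written where the depth coordinate equals the index and off_value everywhere else (for BF16 the two values arrive as raw 16-bit patterns carried in uints). With depth = 4 and val = 2, a work item emits off, off, on, off. A buffer-based sketch of the same semantics, with illustrative names:

    /* One-hot semantics in buffer form; not the committed kernel.
     * on_value/off_value are raw bit patterns (e.g. BF16 halves). */
    __kernel void demo_one_hot(__global const int *indices,
                               __global uint *out,   /* [n][depth] */
                               int depth, uint on_value, uint off_value,
                               float inputScale, float inputTail)
    {
        int i   = get_global_id(0);
        int val = convert_int(convert_float(indices[i]) * inputScale - inputTail);
        for (int d = 0; d < depth; d++)
            out[i * depth + d] = (val == d) ? on_value : off_value;
    }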
@@ -0,0 +1,373 @@
+__kernel void rope_F32_F32toF32_axis0
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    float4 cos, sin;
+
+    READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
+    READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
+    coord.x = coord.x * step;
+    float4 src0 = read_imagef(input, coord);
+    int4 coord_out = coord;
+
+    coord.x += half_head_size;
+    float4 src1 = read_imagef(input, coord);
+
+    float4 dst0 = src0 * cos - src1 * sin;
+    float4 dst1 = src0 * sin + src1 * cos;
+
+    write_imagef(output, coord_out, dst0);
+    coord_out.x += half_head_size;
+    write_imagef(output, coord_out, dst1);
+}
+
+__kernel void rope_F32_F32toF32_axis1
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    float4 cos, sin;
+
+    READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
+    READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
+    coord.y = coord.y * step;
+    float4 src0 = read_imagef(input, coord);
+    int4 coord_out = coord;
+    coord.y += half_head_size;
+    float4 src1 = read_imagef(input, coord);
+
+    float4 dst0 = src0 * cos - src1 * sin;
+    float4 dst1 = src0 * sin + src1 * cos;
+
+    write_imagef(output, coord_out, dst0);
+    coord_out.y += half_head_size;
+    write_imagef(output, coord_out, dst1);
+}
+
+__kernel void rope_F32_F32toF32_axis2
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+
+    float4 cos = read_imagef(cos_cache, coord);
+    float4 sin = read_imagef(sin_cache, coord);
+    coord.z = coord.z * step;
+    float4 src0 = read_imagef(input, coord);
+    int4 coord_out = coord;
+    coord.z += half_head_size;
+    float4 src1 = read_imagef(input, coord);
+
+    float4 dst0 = src0 * cos - src1 * sin;
+    float4 dst1 = src0 * sin + src1 * cos;
+
+    write_imagef(output, coord_out, dst0);
+    coord_out.z += half_head_size;
+    write_imagef(output, coord_out, dst1);
+}
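These kernels apply rotary position embedding (RoPE) in its split-half form: the rotated axis is divided into two halves, and each pair (x0, x1) taken half_head_size apart is rotated by a cached angle, dst0 = x0*cos - x1*sin and dst1 = x0*sin + x1*cos. A minimal buffer-based sketch of the rotation (names are illustrative; the committed kernels do the same arithmetic through image reads and a per-axis coordinate walk):

    /* Split-half RoPE on a flat vector; one work item rotates one pair. */
    __kernel void demo_rope_half(__global const float *x,
                                 __global const float *cos_cache,
                                 __global const float *sin_cache,
                                 __global float *y,
                                 int half_head_size)
    {
        int i = get_global_id(0);          /* 0 .. half_head_size-1 */
        float c  = cos_cache[i];
        float s  = sin_cache[i];
        float x0 = x[i];
        float x1 = x[i + half_head_size];  /* partner element */
        y[i]                  = x0 * c - x1 * s;
        y[i + half_head_size] = x0 * s + x1 * c;
    }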
+
+__kernel void rope_I32_I32toI32_axis0
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    int4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
+    coord.x = coord.x * step;
+    float4 src0 = convert_float4(read_imagei(input, coord));
+    int4 coord_out = coord;
+
+    coord.x += half_head_size;
+    float4 src1 = convert_float4(read_imagei(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    int4 dst0 = convert_int4_rte(_dst0);
+    int4 dst1 = convert_int4_rte(_dst1);
+
+    write_imagei(output, coord_out, dst0);
+    coord_out.x += half_head_size;
+    write_imagei(output, coord_out, dst1);
+}
+
+__kernel void rope_I32_I32toI32_axis1
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    int4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
+    coord.y = coord.y * step;
+    float4 src0 = convert_float4(read_imagei(input, coord));
+    int4 coord_out = coord;
+
+    coord.y += half_head_size;
+    float4 src1 = convert_float4(read_imagei(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    int4 dst0 = convert_int4_rte(_dst0);
+    int4 dst1 = convert_int4_rte(_dst1);
+
+    write_imagei(output, coord_out, dst0);
+    coord_out.y += half_head_size;
+    write_imagei(output, coord_out, dst1);
+}
+
+__kernel void rope_I32_I32toI32_axis2
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+
+    float4 cos = convert_float4(read_imagei(cos_cache, coord));
+    float4 sin = convert_float4(read_imagei(sin_cache, coord));
+    coord.z = coord.z * step;
+    float4 src0 = convert_float4(read_imagei(input, coord));
+    int4 coord_out = coord;
+
+    coord.z += half_head_size;
+    float4 src1 = convert_float4(read_imagei(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = cos - cos_zp;
+    sin = sin - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    int4 dst0 = convert_int4_rte(_dst0);
+    int4 dst1 = convert_int4_rte(_dst1);
+
+    write_imagei(output, coord_out, dst0);
+    coord_out.z += half_head_size;
+    write_imagei(output, coord_out, dst1);
+}
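The I32 variants do the same rotation on dequantized values: zero points are subtracted from the inputs and both caches, per-term scales (scale0 on the cos products, scale1 on the sin products, presumably combining input, cache and output scales on the host side, which this diff does not show) are folded into the multiplies, and the result is re-biased by output_zp and rounded to nearest. A sketch of one pair under that reading of the uniforms:

    /* Quantized RoPE for one element pair; scale0/scale1 are assumed to
     * already combine input, cache and output scales (hypothetical). */
    inline int2 demo_rope_quant(int q0, int q1, int qc, int qs,
                                float input_zp, float cos_zp, float sin_zp,
                                float scale0, float scale1, float output_zp)
    {
        float x0 = (float)q0 - input_zp, x1 = (float)q1 - input_zp;
        float c  = (float)qc - cos_zp,   s  = (float)qs - sin_zp;
        float d0 = x0 * c * scale0 - x1 * s * scale1 + output_zp;
        float d1 = x0 * s * scale1 + x1 * c * scale0 + output_zp;
        return (int2)(convert_int_rte(d0), convert_int_rte(d1));
    }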
+
+__kernel void rope_U32_U32toU32_axis0
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    uint4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
+    coord.x = coord.x * step;
+    float4 src0 = convert_float4(read_imageui(input, coord));
+    int4 coord_out = coord;
+
+    coord.x += half_head_size;
+    float4 src1 = convert_float4(read_imageui(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    uint4 dst0 = convert_uint4_rte(_dst0);
+    uint4 dst1 = convert_uint4_rte(_dst1);
+
+    write_imageui(output, coord_out, dst0);
+    coord_out.x += half_head_size;
+    write_imageui(output, coord_out, dst1);
+}
+
+__kernel void rope_U32_U32toU32_axis1
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    uint4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
+    coord.y = coord.y * step;
+    float4 src0 = convert_float4(read_imageui(input, coord));
+    int4 coord_out = coord;
+
+    coord.y += half_head_size;
+    float4 src1 = convert_float4(read_imageui(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    uint4 dst0 = convert_uint4_rte(_dst0);
+    uint4 dst1 = convert_uint4_rte(_dst1);
+
+    write_imageui(output, coord_out, dst0);
+    coord_out.y += half_head_size;
+    write_imageui(output, coord_out, dst1);
+}
+
+__kernel void rope_U32_U32toU32_axis2
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+
+    float4 cos = convert_float4(read_imageui(cos_cache, coord));
+    float4 sin = convert_float4(read_imageui(sin_cache, coord));
+    coord.z = coord.z * step;
+    float4 src0 = convert_float4(read_imageui(input, coord));
+    int4 coord_out = coord;
+
+    coord.z += half_head_size;
+    float4 src1 = convert_float4(read_imageui(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = cos - cos_zp;
+    sin = sin - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    uint4 dst0 = convert_uint4_rte(_dst0);
+    uint4 dst1 = convert_uint4_rte(_dst1);
+
+    write_imageui(output, coord_out, dst0);
+    coord_out.z += half_head_size;
+    write_imageui(output, coord_out, dst1);
+}
@@ -0,0 +1,307 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform int top;
+_viv_uniform int left;
+_viv_uniform float out_scale_r;
+_viv_uniform float out_scale_g;
+_viv_uniform float out_scale_b;
+_viv_uniform float out_zp_r;
+_viv_uniform float out_zp_g;
+_viv_uniform float out_zp_b;
+_viv_uniform float pad_v_r;
+_viv_uniform float pad_v_g;
+_viv_uniform float pad_v_b;
+_viv_uniform float scale_w;
+_viv_uniform float scale_h;
+_viv_uniform int resize_max_w;
+_viv_uniform int resize_max_h;
+_viv_uniform int out_height;
+_viv_uniform int r_order;
+_viv_uniform int b_order;
+_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
+_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;
+_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+
+__kernel void custom_letterbox_U8toU8
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_t output,
+    int   top_,
+    int   bottom_,
+    int   left_,
+    int   right_,
+    float mean_r_,
+    float mean_g_,
+    float mean_b_,
+    float scale_r_,
+    float scale_g_,
+    float scale_b_,
+    int   pad_r_,
+    int   pad_g_,
+    int   pad_b_,
+    int   reverse_channel
+    )
+{
+    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
+    int2 coord = coord_out;
+    uint4 dst = (uint4)(0,0,0,0);
+    vxc_uchar8 result;
+
+    if (coord_out.x < left || coord_out.x >= resize_max_w ||
+        coord_out.y < top  || coord_out.y >= resize_max_h)
+    {
+        dst.x = convert_uint(pad_v_r);
+        coord.y = coord_out.y + r_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_uint(pad_v_g);
+        coord.y = coord_out.y + out_height;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_uint(pad_v_b);
+        coord.y = coord_out.y + b_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        return;
+    }
+
+    float in_x = convert_float(coord_out.x - left) * scale_w;
+    float in_y = convert_float(coord_out.y - top) * scale_h;
+    float left_x_f = floor(in_x);
+    float top_y_f = floor(in_y);
+    float x_lerp = in_x - left_x_f;
+    float y_lerp = in_y - top_y_f;
+    int left_x_idx = convert_int(left_x_f);
+    int top_y_idx = convert_int(top_y_f);
+
+    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
+    vxc_uchar8 top_data, bottom_data;
+
+    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+
+    float4 left4 = (float4)(0,0,0,0);
+    float4 right4 = (float4)(0,0,0,0);
+    float4 top4 = (float4)(0,0,0,0);
+    float4 bottom4 = (float4)(0,0,0,0);
+    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    top4 = right4 * x_lerp + left4;
+    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    bottom4 = right4 * x_lerp + left4;
+    float4 out = (bottom4 - top4) * y_lerp + top4;
+
+    dst.x = convert_uint(out.x * out_scale_r + out_zp_r);
+    coord.y = coord_out.y + r_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_uint(out.y * out_scale_g + out_zp_g);
+    coord.y = coord_out.y + out_height;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_uint(out.z * out_scale_b + out_zp_b);
+    coord.y = coord_out.y + b_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+}
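custom_letterbox fuses three steps into one pass: bilinear resize of the interior, constant padding of the border (pad_v_*), and per-channel affine requantization (out_scale_*, out_zp_*). The interpolation is the standard two-lerp form; right4 above holds right-minus-left differences (via uniU8RightSubLeft_4x4), so right4 * x_lerp + left4 is a horizontal lerp. A scalar sketch:

    /* Bilinear sample at fractional offsets (x_lerp, y_lerp) between
     * the four neighbouring pixels p00..p11; sketch only. */
    inline float demo_bilinear(float p00, float p01, float p10, float p11,
                               float x_lerp, float y_lerp)
    {
        float top    = (p01 - p00) * x_lerp + p00; /* x-lerp, top row    */
        float bottom = (p11 - p10) * x_lerp + p10; /* x-lerp, bottom row */
        return (bottom - top) * y_lerp + top;      /* y-lerp             */
    }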
+
+__kernel void custom_letterbox_U8toI8
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_t output,
+    int   top_,
+    int   bottom_,
+    int   left_,
+    int   right_,
+    float mean_r_,
+    float mean_g_,
+    float mean_b_,
+    float scale_r_,
+    float scale_g_,
+    float scale_b_,
+    int   pad_r_,
+    int   pad_g_,
+    int   pad_b_,
+    int   reverse_channel
+    )
+{
+    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
+    int2 coord = coord_out;
+    int4 dst = (int4)(0,0,0,0);
+    vxc_char8 result;
+
+    if (coord_out.x < left || coord_out.x >= resize_max_w ||
+        coord_out.y < top  || coord_out.y >= resize_max_h)
+    {
+        dst.x = convert_int(pad_v_r);
+        coord.y = coord_out.y + r_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_int(pad_v_g);
+        coord.y = coord_out.y + out_height;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_int(pad_v_b);
+        coord.y = coord_out.y + b_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        return;
+    }
+
+    float in_x = convert_float(coord_out.x - left) * scale_w;
+    float in_y = convert_float(coord_out.y - top) * scale_h;
+    float left_x_f = floor(in_x);
+    float top_y_f = floor(in_y);
+    float x_lerp = in_x - left_x_f;
+    float y_lerp = in_y - top_y_f;
+    int left_x_idx = convert_int(left_x_f);
+    int top_y_idx = convert_int(top_y_f);
+
+    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
+    vxc_char8 top_data, bottom_data;
+
+    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+
+    float4 left4 = (float4)(0,0,0,0);
+    float4 right4 = (float4)(0,0,0,0);
+    float4 top4 = (float4)(0,0,0,0);
+    float4 bottom4 = (float4)(0,0,0,0);
+    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    top4 = right4 * x_lerp + left4;
+    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    bottom4 = right4 * x_lerp + left4;
+    float4 out = (bottom4 - top4) * y_lerp + top4;
+
+    dst.x = convert_int(out.x * out_scale_r + out_zp_r);
+    coord.y = coord_out.y + r_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_int(out.y * out_scale_g + out_zp_g);
+    coord.y = coord_out.y + out_height;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_int(out.z * out_scale_b + out_zp_b);
+    coord.y = coord_out.y + b_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+}
+
+__kernel void custom_letterbox_U8toF16
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_t output,
+    int   top_,
+    int   bottom_,
+    int   left_,
+    int   right_,
+    float mean_r_,
+    float mean_g_,
+    float mean_b_,
+    float scale_r_,
+    float scale_g_,
+    float scale_b_,
+    int   pad_r_,
+    int   pad_g_,
+    int   pad_b_,
+    int   reverse_channel
+    )
+{
+    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
+    int2 coord = coord_out;
+    half4 tmp;
+    vxc_half8 dst_temp;
+    vxc_ushort8 dst;
+
+    if (coord_out.x < left || coord_out.x >= resize_max_w ||
+        coord_out.y < top  || coord_out.y >= resize_max_h)
+    {
+        float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);
+        _viv_asm(CONV, tmp, pad);
+        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+        _viv_asm(COPY, dst, dst_temp, 16);
+        coord.y = coord_out.y + r_order;
+        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        tmp.x = tmp.y;
+        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+        _viv_asm(COPY, dst, dst_temp, 16);
+        coord.y = coord_out.y + out_height;
+        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        tmp.x = tmp.z;
+        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+        _viv_asm(COPY, dst, dst_temp, 16);
+        coord.y = coord_out.y + b_order;
+        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        return;
+    }
+
+    float in_x = convert_float(coord_out.x - left) * scale_w;
+    float in_y = convert_float(coord_out.y - top) * scale_h;
+    float left_x_f = floor(in_x);
+    float top_y_f = floor(in_y);
+    float x_lerp = in_x - left_x_f;
+    float y_lerp = in_y - top_y_f;
+    int left_x_idx = convert_int(left_x_f);
+    int top_y_idx = convert_int(top_y_f);
+
+    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
+    vxc_uchar8 top_data, bottom_data;
+
+    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+
+    float4 left4 = (float4)(0,0,0,0);
+    float4 right4 = (float4)(0,0,0,0);
+    float4 top4 = (float4)(0,0,0,0);
+    float4 bottom4 = (float4)(0,0,0,0);
+    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    top4 = right4 * x_lerp + left4;
+    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    bottom4 = right4 * x_lerp + left4;
+    float4 out = (bottom4 - top4) * y_lerp + top4;
+
+    float4 out_temp = (float4)(0,0,0,0);
+    out_temp.x = out.x * out_scale_r + out_zp_r;
+    _viv_asm(CONV, tmp, out_temp);
+    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+    _viv_asm(COPY, dst, dst_temp, 16);
+    coord.y = coord_out.y + r_order;
+    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    out_temp.x = out.y * out_scale_g + out_zp_g;
+    _viv_asm(CONV, tmp, out_temp);
+    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+    _viv_asm(COPY, dst, dst_temp, 16);
+    coord.y = coord_out.y + out_height;
+    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    out_temp.x = out.z * out_scale_b + out_zp_b;
+    _viv_asm(CONV, tmp, out_temp);
+    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+    _viv_asm(COPY, dst, dst_temp, 16);
+    coord.y = coord_out.y + b_order;
+    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+}
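All three variants write a planar layout: for output pixel (x, y), red lands on row y + r_order, green on row y + out_height, and blue on row y + b_order. Reading the code, r_order/b_order appear to hold 0 and 2*out_height (swapped when reverse_channel requests BGR), though the host code that fills these uniforms is not part of this diff. A sketch of that assumed mapping:

    /* Assumed planar row mapping for the letterbox output; the values
     * of r_order/b_order are inferred, not shown in this commit. */
    inline int3 demo_plane_rows(int y, int out_height, int reverse_channel)
    {
        int r_order = reverse_channel ? 2 * out_height : 0;
        int b_order = reverse_channel ? 0 : 2 * out_height;
        return (int3)(y + r_order,    /* R row */
                      y + out_height, /* G row */
                      y + b_order);   /* B row */
    }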
@@ -10,7 +10,12 @@
 #include "cl_viv_vx_ext.h"
 
 _viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;
+_viv_uniform VXC_512Bits uniExtract8Bin_2x8;
 _viv_uniform int sf_size;
+_viv_uniform float srcScale;
+_viv_uniform float srcZP;
+_viv_uniform float dstScale;
+_viv_uniform float dstZP;
 #define F_MAX(a,b) ((a)>(b)?(a):(b))
 __kernel void Softmax2VXC
 (
@@ -19,35 +24,37 @@ __kernel void Softmax2VXC
     int axis
 )
 {
-    int4 coord_in = (int4)(0,0,0,0);
+    int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
-    float fMax = 0.0;
+    float fMax = 0;
     for (int i = 0; i < sf_size; i++)
     {
-        vxc_char8 val;
+        vxc_short8 val;
+        vxc_half8 val_h;
         coord_in.x = i;
-        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        _viv_asm(COPY, val_h, val, 16);
         float fval;
-        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
 
         fMax = F_MAX(fMax, fval);
     }
 
     float fProbSum = 0.0f;
     vxc_short8 dst;
     for (int i = 0; i < sf_size; i++)
     {
-        vxc_char8 val;
+        vxc_short8 val;
+        vxc_half8 val_h;
         coord_in.x = i;
-        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        _viv_asm(COPY, val_h, val, 16);
         float fval;
-        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
 
         float fOut = (float)exp(fval - fMax);
         fProbSum += fOut;
         half hVal;
-        _viv_asm(CONV,hVal,fOut);
+        _viv_asm(CONV, hVal, fOut);
-        _viv_asm(COPY,dst,hVal, 4);
+        _viv_asm(COPY, dst, hVal, 4);
 
         VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
     }
@@ -56,15 +63,68 @@ __kernel void Softmax2VXC
         vxc_short8 val;
         vxc_half8 val_h;
         coord_in.x = i;
-        VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
+        VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
         float fval;
         _viv_asm(COPY, val_h, val, 16);
         VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
-        float fOut =fval/fProbSum;
+        float fOut = fval / fProbSum;
         half hVal;
-        _viv_asm(CONV,hVal,fOut);
+        _viv_asm(CONV, hVal, fOut);
         _viv_asm(COPY, dst, hVal, 4);
         VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
     }
 }
+
+__kernel void Softmax2VXC_u8
+    (
+    image2d_array_t input,
+    image2d_array_t output,
+    int axis
+    )
+{
+    int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
+    float fMax = -3.4e38f;
+    for (int i = 0; i < sf_size; i++)
+    {
+        vxc_uchar8 val;
+        coord_in.x = i;
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        float fval;
+        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        fval = (fval - srcZP) * srcScale;
+        fMax = F_MAX(fMax, fval);
+    }
+
+    float fProbSum = 0.0f;
+    vxc_uchar8 dst;
+    for (int i = 0; i < sf_size; i++)
+    {
+        vxc_uchar8 val;
+
+        coord_in.x = i;
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        float fval;
+        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        fval = (fval - srcZP) * srcScale;
+        float fOut = (float)exp(fval - fMax);
+        fProbSum += fOut;
+    }
+
+    for (int i = 0; i < sf_size; i++)
+    {
+        vxc_uchar8 val;
+        coord_in.x = i;
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        float fval;
+        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        fval = (fval - srcZP) * srcScale;
+
+        float fOut = exp(fval - fMax) / fProbSum;
+
+        fOut = fOut * dstScale + dstZP;
+        short dst0;
+        _viv_asm(CONV, dst0, fOut);
+        VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);
+        VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+    }
+}
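Both kernels rely on the max-subtraction form softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)), which is algebraically identical to the naive formula but keeps exp() in range. Note that the new u8 variant seeds the maximum with -3.4e38f (roughly -FLT_MAX) so all-negative rows are handled, and it wraps the math in a dequantize step (srcScale/srcZP) before and a requantize step (dstScale/dstZP) after. A scalar sketch of the three-pass structure:

    /* Numerically stable softmax over one row of length n; sketch of
     * the structure each work item implements above. */
    void demo_softmax_row(const float *in, float *out, int n)
    {
        float fMax = -3.4e38f;                      /* running maximum */
        for (int i = 0; i < n; i++) fMax = fmax(fMax, in[i]);

        float fProbSum = 0.0f;                      /* normalizer */
        for (int i = 0; i < n; i++) fProbSum += exp(in[i] - fMax);

        for (int i = 0; i < n; i++)
            out[i] = exp(in[i] - fMax) / fProbSum;
    }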
@@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1;
 _viv_uniform float output_scale;
 _viv_uniform float output_zp;
 
-#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \
+#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \
 __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
@@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
     int lidx = get_local_id(0); \
     int gidz = get_global_id(1); \
     int4 coord = (int4)(gidx, 0, gidz, 0); \
-    vxc_short8 src0; \
+    load_type src; \
     src_type in_h; \
     float4 sumsqr; \
     float4 tmpSumSqr = (float4)(0); \
@@ -43,9 +43,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
     { \
         for(coord.y = 0; coord.y < height;) \
         { \
-            VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+            VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
             coord.y++; \
-            _viv_asm(COPY, in_h, src0, 16); \
+            _viv_asm(COPY, in_h, src, 16); \
             VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
             tmpSumSqr += sumsqr; \
         } \
@@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
         write_imagef(output, coord_out, data); \
     } \
 }
-GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)
+GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)
-GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)
 
-#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \
+#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \
 __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
@@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
     int lidx = get_local_id(0); \
 \
     int2 coord = (int2)(gidx, get_global_id(1)); \
-    vxc_short8 src0; \
+    load_type src; \
     src_type in_h; \
     float4 sumsqr = (float4)(0); \
 \
@@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
 \
     if(gidx < width) \
     { \
-        VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+        VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
-        _viv_asm(COPY, in_h, src0, 16); \
+        _viv_asm(COPY, in_h, src, 16); \
         VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
         sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \
         sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \
@@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
         write_imagef(output, coord_out, data); \
     } \
 }
-GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)
+GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)
-GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)
 
#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
|
#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
|
||||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
|
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
|
||||||
|
|
@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
|
||||||
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
_viv_asm(COPY, outval, dst, 16); \
|
_viv_asm(COPY, outval, dst, 16); \
|
||||||
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
||||||
}
|
}
|
||||||
|
|
@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
|
||||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||||
float4 norm; \
|
float4 norm; \
|
||||||
norm = alpha * tmpData0 + bias_val; \
|
norm = alpha * tmpData0 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
_viv_asm(COPY, outval, dst, 16); \
|
_viv_asm(COPY, outval, dst, 16); \
|
||||||
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||||
}
|
}
|
||||||
|
|
@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
|
||||||
\
|
\
|
||||||
float4 norm; \
|
float4 norm; \
|
||||||
norm = alpha * tmpData0 + bias_val; \
|
norm = alpha * tmpData0 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
|
|
@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int
|
||||||
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
||||||
|
GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
|
||||||
|
|
||||||
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
|
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
|
||||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
|
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
|
||||||
|
|
@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
|
||||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||||
float4 norm; \
|
float4 norm; \
|
||||||
norm = alpha * tmpData0 + bias_val; \
|
norm = alpha * tmpData0 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
_viv_asm(COPY, outval, dst, 16); \
|
_viv_asm(COPY, outval, dst, 16); \
|
||||||
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||||
}
|
}
|
||||||
|
|
@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8,
|
||||||
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
||||||
|
GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
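Note on the group-norm hunks above: the sums macros now take the raw load container (load_type) separately from the arithmetic type (src_type), which is what lets the 16-bit path gain a U16 variant, and the CONV -> CONV_RTE plus the flipped last VXC_MODIFIER argument switch the float-to-integer conversion to round-to-nearest-even with, as far as the modifier layout goes, a saturating extract. A minimal sketch of the lines the U16 instantiation expands to, taken from the macro body above:

    /* GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8) after expansion */
    vxc_ushort8 src;   /* load_type: raw pixels as loaded from the image */
    vxc_ushort8 in_h;  /* src_type: same container for the integer paths */
    VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, in_h, src, 16); /* bitwise copy of 16 bytes; for F16 this copy is
                                      what reinterprets the vxc_short8 load as vxc_half8 */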
@@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;
 _viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;
 _viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;
 _viv_uniform VXC_512Bits uniExtact8Bin_2x8;
-_viv_uniform int inputZP0;
-_viv_uniform int inputZP1;
-_viv_uniform float input_scale0;
-_viv_uniform float input_scale1;
-_viv_uniform float outputZP;
+_viv_uniform int input0_zp;
+_viv_uniform int input1_zp;
+_viv_uniform float input0_scale;
+_viv_uniform float input1_scale;
+_viv_uniform float output_zp;
-#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
-__kernel void prelu_##name0##to##name1( \
+#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
+__kernel void prelu_##name( \
     __read_only  image2d_array_t input0, \
     __read_only  image2d_array_t input1, \
     __write_only image2d_array_t output) \
 {\
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
-    vxc_float4 vecA, vecB, vecC, vecD;\
+    float4 vecA, vecB, vecC, vecD;\
     input_type0 srcA;\
     copy_type0 src0;\
     vxc_short8 srcB;\
     vxc_half8 src1;\
-    input_type0 input_ZP;\
+    input_type0 zp;\
     VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src0, srcA, 16); \
     VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src1, srcB, 16); \
 \
-    _viv_asm(COPY, input_ZP, inputZP0, 4);\
-    VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
+    _viv_asm(COPY, zp, input0_zp, 4);\
+    VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
         uniDataSubZPtoFp32Part0_4x4); \
-    VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
+    VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
         uniDataSubZPtoFp32Part1_4x4);\
     VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
     VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
 \
-    vecA = vecA * input_scale0;\
-    vecB = vecB * input_scale0;\
-    vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
-    vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
-    vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
-    vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
-    vecA = maxData0 + vecC * minData0 + outputZP;\
-    vecB = maxData1 + vecD * minData1 + outputZP;\
+    vecA = vecA * input0_scale;\
+    vecB = vecB * input0_scale;\
+    float4 maxData0 = vecA > 0 ? vecA : 0.0; \
+    float4 maxData1 = vecB > 0 ? vecB : 0.0; \
+    float4 minData0 = vecA < 0 ? vecA : 0.0; \
+    float4 minData1 = vecB < 0 ? vecB : 0.0; \
+    vecA = maxData0 + vecC * minData0 + output_zp;\
+    vecB = maxData1 + vecD * minData1 + output_zp;\
     convert_type dst0, dst1;\
     _viv_asm(CONV_RTE, dst0, vecA);\
     _viv_asm(CONV_RTE, dst1, vecB);\
@@ -164,49 +164,49 @@ _viv_uniform float outputZP;
     VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 }
 // name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type
-PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
-PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
-PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
-PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
-PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
-PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
+PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
+PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
+PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
+PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
 
-#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
-__kernel void prelu_##name0##to##name1##_2D( \
+#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
+__kernel void prelu_##name##_2D( \
     __read_only  image2d_array_t input0, \
     __read_only  image2d_array_t input1, \
     __write_only image2d_array_t output) \
 {\
     int2 coord = (int2)(get_global_id(0), get_global_id(1));\
-    vxc_float4 vecA, vecB, vecC, vecD;\
+    float4 vecA, vecB, vecC, vecD;\
     input_type0 srcA;\
     copy_type0 src0;\
     vxc_short8 srcB;\
     vxc_half8 src1;\
-    input_type0 input_ZP;\
+    input_type0 zp;\
    VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src0, srcA, 16); \
     VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src1, srcB, 16); \
 \
-    _viv_asm(COPY, input_ZP, inputZP0, 4);\
-    VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
-    VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
+    _viv_asm(COPY, zp, input0_zp, 4);\
+    VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
+    VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
     VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
     VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
 \
-    vecA = vecA * input_scale0;\
-    vecB = vecB * input_scale0;\
-    vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
-    vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
-    vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
-    vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
-    vecA = maxData0 + vecC * minData0 + outputZP;\
-    vecB = maxData1 + vecD * minData1 + outputZP;\
+    vecA = vecA * input0_scale;\
+    vecB = vecB * input0_scale;\
+    float4 maxData0 = vecA > 0 ? vecA : 0.0; \
+    float4 maxData1 = vecB > 0 ? vecB : 0.0; \
+    float4 minData0 = vecA < 0 ? vecA : 0.0; \
+    float4 minData1 = vecB < 0 ? vecB : 0.0; \
+    vecA = maxData0 + vecC * minData0 + output_zp;\
+    vecB = maxData1 + vecD * minData1 + output_zp;\
     convert_type dst0, dst1;\
     _viv_asm(CONV_RTE, dst0, vecA);\
     _viv_asm(CONV_RTE, dst1, vecB);\
@@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
     _viv_asm(COPY, dst, dst2, 16); \
     VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 }
-PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
-PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
-PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
-PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
-PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
-PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
+PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
+PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
+PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
+PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
 
-#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \
-__kernel void prelu_U8U8to##name##_2D( \
+#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \
+__kernel void prelu_##name##_2D( \
     __read_only  image2d_array_t input0, \
     __read_only  image2d_array_t input1, \
     __write_only image2d_array_t output) \
 {\
     int2 coord = (int2)(get_global_id(0), get_global_id(1));\
-    vxc_float4 vecA, vecB, vecC, vecD;\
-    vxc_uchar16 src0;\
-    vxc_uchar16 src1;\
-    vxc_uchar16 input_ZP0;\
-    vxc_uchar16 input_ZP1;\
+    float4 vecA, vecB, vecC, vecD;\
+    src0_type src0;\
+    src1_type src1;\
+    short zp0;\
+    short zp1;\
     VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 \
-    _viv_asm(COPY, input_ZP0, inputZP0, 4);\
-    VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
-    VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
-    _viv_asm(COPY, input_ZP1, inputZP1, 4);\
-    VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
-    VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
+    _viv_asm(COPY, zp0, input0_zp, 2);\
+    VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
+    VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
+    _viv_asm(COPY, zp1, input1_zp, 4);\
+    VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
+    VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
 \
-    vecA = vecA * input_scale0;\
-    vecB = vecB * input_scale0;\
-    vecC = vecC * input_scale1;\
-    vecD = vecD * input_scale1;\
-    vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
-    vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
-    vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
-    vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
-    vecA = maxData0 + vecC * minData0 + outputZP;\
-    vecB = maxData1 + vecD * minData1 + outputZP;\
+    vecA = vecA * input0_scale;\
+    vecB = vecB * input0_scale;\
+    vecC = vecC * input1_scale;\
+    vecD = vecD * input1_scale;\
+    float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
+    float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
+    float4 minData0 = vecA < 0 ? vecA : 0.0; \
+    float4 minData1 = vecB < 0 ? vecB : 0.0; \
+    vecA = maxData0 + vecC * minData0 + output_zp;\
+    vecB = maxData1 + vecD * minData1 + output_zp;\
     convert_type dst0, dst1;\
     _viv_asm(CONV_RTE, dst0, vecA);\
     _viv_asm(CONV_RTE, dst1, vecB);\
@@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
     _viv_asm(COPY, dst, dst2, 16); \
     VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 }
-PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)
-PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)
+PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
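The PReLU kernels above all follow the same dequantize / PReLU / requantize chain; the vector code is easier to read against a scalar sketch of one lane (a reference only, not part of the source):

    /* one lane of the PReLU math, mirroring maxData/minData + output_zp above */
    float prelu_ref(float x_minus_zp, float slope, float in_scale, float out_zp)
    {
        float v   = x_minus_zp * in_scale;   /* vecA = vecA * input0_scale              */
        float pos = v > 0.0f ? v : 0.0f;     /* maxData0                                */
        float neg = v < 0.0f ? v : 0.0f;     /* minData0                                */
        return pos + slope * neg + out_zp;   /* maxData0 + vecC * minData0 + output_zp  */
    }

In the F16-slope variants the slope arrives as half floats (vecC/vecD via uniConvF16toF32), while PRELU_INTEGER_2D additionally dequantizes the slope with zp1 and input1_scale before the multiply.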
@@ -0,0 +1,181 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
+_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
+_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;
+_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;
+_viv_uniform int out_height;
+
+__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
+    int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
+    coord_in.x = (coord_out.x * 2 - 1) >> 2;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+
+    vxc_uchar16 in0, in1, tmp, result;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 dst0;
+    while (coord_out.y < out_height)
+    {
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_in.y += 2;
+        coord_out.y++;
+    }
+}
+
+_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;
+_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;
+_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;
+_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;
+__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
+    int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
+    coord_in.x = (coord_out.x * 2 - 3) >> 3;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+
+    vxc_uchar16 in0, in1, dst0, dst1;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 tmp;
+    while (coord_out.y < out_height)
+    {
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_in.y += 2;
+        coord_out.y++;
+    }
+}
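Both upsample kernels in this file derive the source column from the half-pixel-centers mapping src = (dst + 0.5) / scale - 0.5, which for an integer scale rearranges to floor((2*dst - (scale - 1)) / (2*scale)); the shifts above are exactly that floor division. A sketch of the equivalence (the helper name is illustrative only):

    /* half-pixel-centers source index for a power-of-two upscale factor */
    /* log2_scale = 1 gives (2*d - 1) >> 2, the 2x kernel's expression;  */
    /* log2_scale = 2 gives (2*d - 3) >> 3, the 4x kernel's expression   */
    int src_index(int d, int log2_scale)
    {
        int scale = 1 << log2_scale;
        return (2 * d - (scale - 1)) >> (log2_scale + 1); /* floor division */
    }

The coord_out.x == 0 override pins the first column to source column -1; the arithmetic shift already yields that value, so for these power-of-two kernels the ternary is a belt-and-braces guard.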
@@ -0,0 +1,102 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
+_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
+_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;
+_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;
+_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;
+_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;
+_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;
+_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;
+__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+    coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+    coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;
+    coord_in.y = coord_out.y == 0 ? -1 : coord_in.y;
+
+    vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 data;
+
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+
+    VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+
+    VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+}
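The 3x factor is not a power of two, so this kernel divides in short arithmetic instead of shifting; C-style division truncates toward zero rather than flooring, which differs from the exact mapping only at dst == 0, and that is precisely the case the ternary override patches:

    /* 3x half-pixel-centers mapping, as in the kernel above            */
    /* exact: src = floor((2*d - 2) / 6); trunc((2*d - 1) / 6) matches  */
    /* it for every d >= 1 (the numerator is odd, so never a multiple   */
    /* of 6), leaving only d == 0 to force to -1                        */
    short src3 = (short)(d * 2 - 1) / (short)6;
    src3 = (d == 0) ? (short)-1 : src3;

Unlike the looping 2x/4x/8x kernels, this one maps both x and y this way and writes its fixed block of output rows directly (six 15-pixel stores) rather than iterating to out_height.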
@@ -0,0 +1,167 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
+_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
+_viv_uniform int out_height;
+_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;
+__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
+    int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
+    coord_in.x = (coord_out.x * 2 - 7) >> 4;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+
+    vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 tmp;
+    while (coord_out.y < out_height)
+    {
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_in.y += 2;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+    }
+}
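All four resize kernels finish with the same U8 post-process: the bilinear accumulator is requantized by the multiplier and output zero point that the host packs into multAndoutZP (see the inline comment on that uniform). The exact fixed-point layout of the packed multiplier is defined on the host side, so the following is only a scalar sketch of the intent:

    /* one lane of the uniU8PostProcess_2x8 step; the DP2x8 with        */
    /* VXC_RM_ToNearestEven and the saturation flag set performs the    */
    /* rounding and clamping of all 8 lanes in a single instruction     */
    uchar u8_postprocess(ushort acc, float multiplier_f, float output_zp_f)
    {
        return convert_uchar_sat_rte((float)acc * multiplier_f + output_zp_f);
    }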
@@ -0,0 +1,303 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float scale0;
+_viv_uniform float scale1;
+_viv_uniform float output_zp;
+_viv_uniform int half_head_size;
+_viv_uniform VXC_512Bits uniATimesB_0_4x4;
+_viv_uniform VXC_512Bits uniATimesB_1_4x4;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+
+#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \
+__kernel void rope_##name##_bnhs \
+( \
+    __read_only  image2d_array_t input, \
+    __read_only  image2d_array_t cos_cache, \
+    __read_only  image2d_array_t sin_cache, \
+    __write_only image2d_array_t output, \
+    int axis \
+) \
+{ \
+    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+    int4 coord_out = coord_in; \
+\
+    int8 input_desc; \
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
+    _viv_asm(MOV, coord_in.w, baseAddr); \
+\
+    src_type data0, data1; \
+    src1_type cos, sin; \
+    copy_type v0, v1; \
+    dst_type dst; \
+    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
+        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+    _viv_asm(COPY, cos, v0, 16); \
+    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+    _viv_asm(COPY, sin, v1, 16); \
+    coord_in.y += half_head_size; \
+    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \
+        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+\
+    int8 output_desc; \
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+    _viv_asm(MOV, coord_out.w, baseAddr); \
+\
+    float4 data2, data3, data4, data5; \
+    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
+    data3 = data3 * scale0 - data5 * scale1 + output_zp; \
+\
+    int4 dst0 = convert_int4_rte(data2); \
+    int4 dst1 = convert_int4_rte(data3); \
+\
+    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
+        dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+\
+    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    data2 = data2 * scale1 + data4 * scale0 + output_zp; \
+    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
+\
+    dst0 = convert_int4_rte(data2); \
+    dst1 = convert_int4_rte(data3); \
+\
+    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
+    coord_out.y += half_head_size; \
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
+        dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
+ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
+ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
+ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
+ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
+ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
|
|
||||||
|
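/* Note: the SYMM kernels above apply the rotary embedding to a half-split head. With
 * x_lo the first half_head_size elements and x_hi the second half, each lane is
 * rotated as
 *     out_lo = (x_lo * cos) * scale0 - (x_hi * sin) * scale1 + output_zp
 *     out_hi = (x_lo * sin) * scale1 + (x_hi * cos) * scale0 + output_zp
 * i.e. the standard RoPE rotation with the symmetric quantization scales folded into
 * scale0/scale1. The uniATimesB_* dot-product tables and the scales are configured
 * host-side. */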
__kernel void rope_F16_F16toF16_bnhs
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    int4 coord_out = coord_in;

    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);
    coord_in.y += half_head_size;
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 - data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 * scale1 + data4 * scale0 + output_zp;
    data3 = data3 * scale1 + data5 * scale0 + output_zp;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.y += half_head_size;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnhs \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
    int4 coord_out = coord_in; \
 \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.y += half_head_size; \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
    float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
    data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.y += half_head_size; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

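/* Note: the ASYM variants handle asymmetric quantization: each operand first has its
 * zero point removed via the uniAMinusZp_* tables, the rotation runs in float, and the
 * result is requantized. A minimal scalar sketch of the same arithmetic, with
 * illustrative names only:
 *
 *     float lo = (float)(x[i]                  - in0_zp);
 *     float hi = (float)(x[i + half_head_size] - in0_zp);
 *     float c  = (float)(cos_q[i] - cos_zp);
 *     float s  = (float)(sin_q[i] - sin_zp);
 *     out[i]                  = round(lo * c * scale0 - hi * s * scale1 + output_zp);
 *     out[i + half_head_size] = round(lo * s * scale1 + hi * c * scale0 + output_zp);
 */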
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnhs
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    int4 coord_out = coord_in;

    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    coord_in.y += half_head_size;
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 data0 = src0 * cos0 - src2 * sin0;
    float4 data1 = src1 * cos1 - src3 * sin1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    data0 = src0 * sin0 + src2 * cos0;
    data1 = src1 * sin1 + src3 * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.y += half_head_size;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

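/* Note: the BF16 kernels widen to f32 before rotating. bfloat16 is the high half of an
 * IEEE f32, so the uniConvBF16toF32_Part* tables in effect place each 16-bit lane into
 * the top of a 32-bit lane, and uniExtractOddData_2x8 narrows back by keeping the high
 * shorts. A scalar sketch of the two conversions (assumed from the uniform names):
 *
 *     uint32_t f32_bits = (uint32_t)bf16_bits << 16;   // bf16 -> f32, exact
 *     uint16_t bf16_out = (uint16_t)(f32_bits >> 16);  // f32 -> bf16, truncating
 */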
@@ -0,0 +1,245 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnh1 \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord.x += half_head_size; \
    VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale0 - data5 * scale1 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    dst_type dst; \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    data2 = data2 * scale1 + data4 * scale0 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bnh1
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);
    coord.x += half_head_size;
    VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 - data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 + data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnh1 \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord.x += half_head_size; \
    VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
    float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
    data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnh1
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    coord.x += half_head_size;
    VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 data0 = src0 * cos0 - src2 * sin0;
    float4 data1 = src1 * cos1 - src3 * sin1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    data0 = src0 * sin0 + src2 * cos0;
    data1 = src1 * sin1 + src3 * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

@@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bsnh \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    dst_type dst; \
    VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
 \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

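/* Note: unlike the half-split bnhs/bnh1 layouts, the bsnh kernels rotate interleaved
 * (even, odd) element pairs: coord_in.x *= 2 points each work-item at 16 consecutive
 * elements, the uniAEvenTimesB_* / uniAOddTimesB_* tables select the even and odd
 * lanes, and each pair is rotated as
 *     out_even = x_even * cos * scale0 - x_odd * sin * scale1 + output_zp
 *     out_odd  = x_even * sin * scale1 + x_odd * cos * scale0 + output_zp
 * with the two 8-lane results stored at x and x + 8. */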
__kernel void rope_F16_F16toF16_bsnh
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bsnh \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
 \
    VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
    float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
    data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bsnh
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 even = (float4)(src0.xz, src1.xz);
    float4 odd = (float4)(src0.yw, src1.yw);
    float4 data0 = even * cos0 - odd * sin0;
    float4 data1 = even * sin0 + odd * cos0;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    even = (float4)(src2.xz, src3.xz);
    odd = (float4)(src2.yw, src3.yw);
    data0 = even * cos1 - odd * sin1;
    data1 = even * sin1 + odd * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

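/* Note: the BF16 bsnh path has no even/odd dot-product tables, so it deinterleaves
 * with plain swizzles after widening: (src0.xz, src1.xz) gathers the even elements
 * and (src0.yw, src1.yw) the odd ones. uniExtractOddData_2x8 then narrows each f32
 * back to bf16 by keeping its high 16 bits; the exact output lane packing is
 * configured host-side in that uniform. */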
@@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnsh \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    dst_type dst; \
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
 \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bnsh
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

||||||
|
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;

#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnsh \
    ( \
    __read_only  image2d_array_t input, \
    __read_only  image2d_array_t cos_cache, \
    __read_only  image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
 \
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
    float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
    data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
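For reference, the macro above computes a rotary position embedding over asymmetric-quantized data: subtract each zero point, rotate paired lanes by (cos, sin), then requantize with round-to-nearest-even. Below is a minimal scalar C sketch of that math, assuming the even/odd pairing implied by uniAEvenMinusZp_4x4/uniAOddMinusZp_4x4 and per-tensor affine quantization; rope_rotate_asym_ref and requant are illustrative names, not ovxlib API.

/* Minimal sketch, not ovxlib code: scalar form of ROPE_ASYM_BNSH for U8.
 * scale0/scale1 are assumed to fold (in_scale * cos_scale) / out_scale
 * and (in_scale * sin_scale) / out_scale, mirroring the kernel. */
#include <math.h>
#include <stdint.h>

static uint8_t requant(float v)                 /* round-to-nearest-even, then clamp */
{
    long r = lrintf(v);
    return (uint8_t)(r < 0 ? 0 : (r > 255 ? 255 : r));
}

void rope_rotate_asym_ref(const uint8_t* x, const uint8_t* cos_c, const uint8_t* sin_c,
                          uint8_t* out, int half /* head_size / 2 */,
                          int in0_zp, int cos_zp, int sin_zp,
                          float scale0, float scale1, float output_zp)
{
    for (int i = 0; i < half; ++i)
    {
        /* even/odd lanes form the rotated pair (assumption: interleaved layout) */
        float xe = (float)(x[2 * i]     - in0_zp);
        float xo = (float)(x[2 * i + 1] - in0_zp);
        float c  = (float)(cos_c[i] - cos_zp);
        float s  = (float)(sin_c[i] - sin_zp);
        out[2 * i]     = requant(xe * c * scale0 - xo * s * scale1 + output_zp);
        out[2 * i + 1] = requant(xe * s * scale1 + xo * c * scale0 + output_zp);
    }
}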
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

__kernel void rope_BF16_BF16toBF16_bnsh
    (
    __read_only  image2d_array_t input,
    __read_only  image2d_array_t cos_cache,
    __read_only  image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 even = (float4)(src0.xz, src1.xz);
    float4 odd = (float4)(src0.yw, src1.yw);
    float4 data0 = even * cos0 - odd * sin0;
    float4 data1 = even * sin0 + odd * cos0;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    even = (float4)(src2.xz, src3.xz);
    odd = (float4)(src2.yw, src3.yw);
    data0 = even * cos1 - odd * sin1;
    data1 = even * sin1 + odd * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
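The BF16 variant never does arithmetic on bfloat16 directly: uniConvBF16toF32_Part0/1_2x8 widen each bf16 lane to float32 by pairing it with a zero low half, and uniExtractOddData_2x8 takes the high 16 bits back after the rotation. That works because bfloat16 is exactly the top half of an IEEE-754 float32. A standalone C sketch of the conversion (not ovxlib code; the return path is plain truncation, which is what extracting the odd 16-bit lanes amounts to):

#include <stdint.h>
#include <string.h>

static float bf16_to_f32(uint16_t b)
{
    uint32_t u = (uint32_t)b << 16;     /* bf16 bits become the f32 high half */
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

static uint16_t f32_to_bf16_trunc(float f)
{
    uint32_t u;
    memcpy(&u, &f, sizeof f);
    return (uint16_t)(u >> 16);         /* keep the high 16 bits, drop the rest */
}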
@@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \
 }
 SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)
 SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)
+SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)
+SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)
+
+#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \
+__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \
+    __read_only image2d_t input_ref, \
+    image2d_t temp_ref, \
+    image2d_t output0 \
+    ) \
+{ \
+    int gidx = get_global_id(0); \
+    Image img0 = create_image_from_image2d(input_ref, 2); \
+    Image img1 = create_image_from_image2d(temp_ref, 2); \
+    __global data_type* in_ptr = (__global data_type*)img0.ptr; \
+    __global data_type* out_ptr = (__global data_type*)img1.ptr; \
+    data_type src, dst; \
+    src = in_ptr[gidx]; \
+    vxc_ushort8 mp0; \
+    _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+    VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+        uniU8MulAndPostShift0_Lo_2x8); \
+    out_ptr[gidx] = dst; \
+}
+SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)
+SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)
+
+#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \
+__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \
+    __read_only image2d_t input_index, \
+    __read_only image2d_t input_update, \
+    image2d_t temp_ref, \
+    image2d_t input0, \
+    image2d_t output1, \
+    int width, int area, int vol, int coord_dim \
+    ) \
+{ \
+    int gidx = get_global_id(0); \
+    int gidy = get_global_id(1); \
+ \
+    Image img1 = create_image_from_image2d(input_index, 4); \
+    Image img2 = create_image_from_image2d(input_update, 2); \
+    Image img3 = create_image_from_image2d(temp_ref, 2); \
+    __global int* index_ptr = (__global int*)img1.ptr; \
+    __global data_type* update_ptr = (__global data_type*)img2.ptr; \
+    __global data_type* output_ptr = (__global data_type*)img3.ptr; \
+    data_type dst; \
+ \
+    int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \
+    data_type src = update_ptr[gidy * update_width + gidx]; \
+    int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
+    int loc = idx * output_width + gidx; \
+    vxc_ushort8 mp1; \
+    _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+    VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+        uniU8MulAndPostShift1_Lo_2x8); \
+    output_ptr[loc] = dst; \
+}
+SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)
+SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)
+
+__kernel void scatter_nd_update_ref2out_F16toF16(
+    __read_only image2d_t input_ref,
+    image2d_t temp_ref,
+    image2d_t output0
+    )
+{
+    int gidx = get_global_id(0);
+    Image img0 = create_image_from_image2d(input_ref, 2);
+    Image img1 = create_image_from_image2d(temp_ref, 2);
+    __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;
+    __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;
+    out_ptr[gidx] = in_ptr[gidx];
+}
+
+__kernel void scatter_nd_update_update2ref_F16toF16_16x(
+    __read_only image2d_t input_index,
+    __read_only image2d_t input_update,
+    image2d_t temp_ref,
+    image2d_t input0,
+    image2d_t output1,
+    int width, int area, int vol, int coord_dim
+    )
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    Image img1 = create_image_from_image2d(input_index, 4);
+    Image img2 = create_image_from_image2d(input_update, 2);
+    Image img3 = create_image_from_image2d(temp_ref, 2);
+    __global int* index_ptr = (__global int*)img1.ptr;
+    __global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;
+    __global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;
+
+    int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);
+    int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;
+    int loc = idx * output_width + gidx;
+    output_ptr[loc] = update_ptr[gidy * update_width + gidx];
+}
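The update2ref kernels all share one addressing scheme: each index tuple is flattened by per-axis strides into a row of the reference tensor, then one update row is written at that offset. In the kernel source, offset_idx, offsetX..offsetW, update_width, and output_width appear to be compile-time constants injected when the program is built. A standalone C sketch of the same logic, with those constants turned into parameters purely for illustration (not ovxlib API):

#include <stdint.h>

void scatter_nd_update_ref(const int32_t* index, const int16_t* update, int16_t* ref,
                           int num_updates, int coord_dim, int update_width,
                           int output_width, const int offsets[4])
{
    for (int y = 0; y < num_updates; ++y)
    {
        const int32_t* id = index + y * coord_dim;
        int idx = 0;
        for (int k = 0; k < coord_dim; ++k)     /* idx = ix*offsetX + iy*offsetY + ... */
        {
            idx += id[k] * offsets[k];
        }
        for (int x = 0; x < update_width; ++x)  /* copy one update row into place */
        {
            ref[idx * output_width + x] = update[y * update_width + x];
        }
    }
}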
File diff suppressed because it is too large
@@ -29,6 +29,7 @@
 #include "VX/vx_ext_program.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_prv.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_log.h"
 #include "libnnext/vsi_nn_vxkernel.h"
 #include "kernel/vsi_nn_kernel.h"
@@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel
     vx_size * program_len = NULL;
     const char **program_src = NULL;
     vx_context ctx = NULL;
-    vsi_nn_context_t context = NULL;
     vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
     uint8_t i = 0;
     vsi_bool load_from_file = FALSE;
+    vsi_nn_runtime_option_t* options;
+    options = ((vsi_nn_graph_prv_t*)graph)->options;
 
 #define MAX_BUILDPROGRAM_LEN 128
     char cmd[MAX_BUILDPROGRAM_LEN] = {0};
@@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel
     memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN);
     status = VSI_FAILURE;
     ctx = vxGetContext( (vx_reference)graph->g );
-    context = graph->ctx;
-    evis = context->config.evis.ver;
+    evis = options->config.evis.ver;
 
     program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *));
     CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final );
@@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel
     {
         // set default evis version is 2
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
     }
     else
     {
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
     }
     status = vxBuildProgram(program, cmd);
@@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel
     vx_size program_len = 0;
     const uint8_t *program_ptr = NULL;
     vx_context ctx;
-    vsi_nn_context_t context;
+    vsi_nn_runtime_option_t* options;
     vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
 
 #define MAX_BUILDPROGRAM_LEN 128
@@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel
     status = VSI_FAILURE;
 
     ctx = vxGetContext( (vx_reference)graph->g );
-    context = graph->ctx;
-    evis = context->config.evis.ver;
+    options = ((vsi_nn_graph_prv_t*)graph)->options;
+    evis = options->config.evis.ver;
 
     program_ptr = vsi_nn_VxBinResourceGetResource(
         kernel_info->resource_name[kernel_info->resource_num - 1], &program_len);
@@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel
     {
         // set default evis version is 2
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
     }
     else
     {
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
     }
 #else
     snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension");
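The net effect of these hunks is only where the EVIS version and the 40-bit-VA flag are read from: the per-graph vsi_nn_runtime_option_t now supplies them instead of the context. The command string handed to vxBuildProgram is unchanged. A tiny standalone C sketch of what that string looks like (the values are example assumptions, not read from a real graph):

#include <stdio.h>

int main(void)
{
    char cmd[128];
    int evis = 2, use_40bits_va = 1;    /* example values only */
    snprintf(cmd, sizeof cmd,
        "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, use_40bits_va);
    /* cmd: "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=1" */
    printf("%s\n", cmd);
    return 0;
}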
@@ -35,6 +35,8 @@
 #include "utils/vsi_nn_constraint_check.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
+#include "vsi_nn_tensor_util_prv.h"
+#include "vsi_nn_error.h"
 
 static vsi_status _try_set_high_presision_tensor
     (
@@ -120,9 +122,22 @@ static vsi_status _static_batchnorm
     vsi_nn_tensor_t ** outputs
     )
 {
+#define _TENSOR_LEN 64
     vsi_status status;
     vsi_nn_kernel_param_t * param = NULL;
     vsi_nn_tensor_t* reshape_tensors[6] = { NULL };
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
+    uint32_t new_rank = 4;
+    vsi_nn_tensor_t* input0 = NULL;
+    vsi_nn_tensor_t* output = NULL;
+    char reshape0_tensor_name[_TENSOR_LEN];
+    char reshape1_tensor_name[_TENSOR_LEN];
+    char batch_norm_tensor_name[_TENSOR_LEN];
+
+    memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
+    memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
+    memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name));
+
     status = VSI_FAILURE;
 
     status = _try_set_high_presision_tensor(inputs);
@@ -131,10 +146,43 @@ static vsi_status _static_batchnorm
         VSILOGE("Set tensor attr of high presision fail");
         return status;
     }
-    if(_require_reshape(self, inputs))
+    if (_require_reshape(self, inputs))
     {
-        reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input;
-        reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output;
+        if (3 == inputs[0]->attr.dim_num)
+        {
+            shape[0] = inputs[0]->attr.size[0];
+            shape[1] = 1;
+            shape[2] = inputs[0]->attr.size[1];
+            shape[3] = inputs[0]->attr.size[2];
+        }
+        else if (5 == inputs[0]->attr.dim_num)
+        {
+            shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
+            shape[1] = inputs[0]->attr.size[2];
+            shape[2] = inputs[0]->attr.size[3];
+            shape[3] = inputs[0]->attr.size[4];
+        }
+
+        input0 = vsi_nn_kernel_insert_reshape_node(self->graph,
+            inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD);
+        CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
+        reshape_tensors[0] = input0;
+        snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
+        if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
+            goto final;
+        }
+        output = vsi_nn_kernel_insert_reshape_node(self->graph,
+            outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD);
+        CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
+        reshape_tensors[5] = output;
+        snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
+        if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u reshape 1 node output name fail", self->uid);
+            goto final;
+        }
     }
     else
     {
@@ -155,12 +203,26 @@ static vsi_status _static_batchnorm
         reshape_tensors, 5,
         &reshape_tensors[5], 1, param );
 
-    if( self->n )
+    if ( self->n )
     {
         status = VSI_SUCCESS;
     }
 
-    vsi_nn_kernel_param_release( &param );
+    vsi_nn_kernel_param_release(&param);
 
+    if (output)
+    {
+        snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
+        if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
+            goto final;
+        }
+    }
+
+final:
+    vsi_safe_release_tensor(input0);
+    vsi_safe_release_tensor(output);
+
     return status;
 }
@@ -313,68 +375,6 @@ static vsi_status op_compute
     return status;
 } /* op_compute() */
 
-static vsi_status op_optimize
-    (
-    vsi_nn_node_t * self,
-    vsi_nn_tensor_t ** inputs,
-    vsi_nn_tensor_t ** outputs,
-    vsi_nn_opt_direction_e direction
-    )
-{
-    uint32_t dim = 0;
-    vsi_nn_batcnnorm_lcl_data *local = NULL;
-    vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
-    char tensor_name[128];
-
-    dim = inputs[0]->attr.dim_num;
-    if(_require_reshape(self, inputs) == FALSE)
-    {
-        return VSI_SUCCESS;
-    }
-
-    VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
-    /*
-        reshape 3d input (xcn) --> 4d input (whcn)
-        reshape 3d output(xcn) --> 4d output(whcn)
-    */
-    dim = 4;
-    if (3 == inputs[0]->attr.dim_num)
-    {
-        shape[0] = inputs[0]->attr.size[0];
-        shape[1] = 1;
-        shape[2] = inputs[0]->attr.size[1];
-        shape[3] = inputs[0]->attr.size[2];
-    }
-    else if (5 == inputs[0]->attr.dim_num)
-    {
-        shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
-        shape[1] = inputs[0]->attr.size[2];
-        shape[2] = inputs[0]->attr.size[3];
-        shape[3] = inputs[0]->attr.size[4];
-    }
-    local = self->nn_param.batch_norm.local;
-    if (VSI_NN_OPTIMIZE_BACKWARD == direction)
-    {
-        local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
-    }
-    else
-    {
-        local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
-        if(local->reshaped_output && local->reshaped_output->t)
-        {
-            memset(tensor_name, 0, sizeof(tensor_name));
-            snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
-            if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
-            {
-                VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
-                return VSI_FAILURE;
-            }
-        }
-    }
-
-    return VSI_SUCCESS;
-} /* op_optimize() */
-
 static vsi_bool _dynamic_check
     (
     vsi_nn_node_t * self,
@@ -494,58 +494,6 @@ static vsi_bool op_check
     }
 } /* op_check() */
 
-static vsi_bool op_setup
-    (
-    vsi_nn_node_t * self,
-    vsi_nn_tensor_t ** inputs,
-    vsi_nn_tensor_t ** outputs
-    )
-{
-    vsi_nn_batcnnorm_lcl_data *local = NULL;
-    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
-    {
-        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
-        memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
-            VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
-    }
-
-    if(_require_reshape(self, inputs))
-    {
-        local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data));
-        if(NULL == local)
-        {
-            return VSI_FAILURE;
-        }
-        memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data));
-        self->nn_param.batch_norm.local = local;
-    }
-    return TRUE;
-} /* op_setup() */
-
-static vsi_status op_deinit
-    (
-    vsi_nn_node_t * self
-    )
-{
-    vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm);
-    if(p->local)
-    {
-        if (p->local->reshaped_input)
-        {
-            vsi_nn_ReleaseTensor(&(p->local->reshaped_input));
-            p->local->reshaped_input = NULL;
-        }
-        if (p->local->reshaped_output)
-        {
-            vsi_nn_ReleaseTensor(&(p->local->reshaped_output));
-            p->local->reshaped_output = NULL;
-        }
-        vsi_nn_safe_free(p->local);
-    }
-    vsi_nn_op_common_deinit(self);
-    return VSI_SUCCESS;
-}
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -555,10 +503,10 @@ DEF_OP_REG
     /* op_name */ BATCH_NORM,
     /* init */ NULL,
     /* compute */ op_compute,
-    /* deinit */ op_deinit,
+    /* deinit */ vsi_nn_op_common_deinit,
     /* check */ op_check,
-    /* setup */ op_setup,
+    /* setup */ vsi_nn_op_common_setup,
-    /* optimize */ op_optimize,
+    /* optimize */ NULL,
     /* input_num */ 5,
     /* output_num */ 1
     );
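The reshape rule that used to live in op_optimize now runs inline in _static_batchnorm: per the old comment, a 3-D (x, c, n) input is padded to the 4-D (x, 1, c, n) whcn layout, and a 5-D input folds its two innermost dims so the kernel always sees rank 4. A standalone C sketch of that normalization (not ovxlib API; the function name is illustrative):

#include <stdint.h>

void batchnorm_shape_to_4d(const uint64_t* in, uint32_t rank, uint64_t out[4])
{
    if (rank == 3)          /* (x, c, n) -> (x, 1, c, n) */
    {
        out[0] = in[0]; out[1] = 1; out[2] = in[1]; out[3] = in[2];
    }
    else if (rank == 5)     /* fold the two innermost dims into one */
    {
        out[0] = in[0] * in[1]; out[1] = in[2]; out[2] = in[3]; out[3] = in[4];
    }
}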
@@ -118,6 +118,7 @@ static vsi_bool op_setup
     if (outputs[0]->attr.dim_num == 0)
     {
         outputs[0]->attr.size[0] = 1;
+        outputs[0]->attr.dim_num = 1;
         vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
     }
     else
@@ -82,6 +82,7 @@ static vsi_bool op_check
 {
     BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1)
         IO_TYPE(D_U32, D_U32)
+        IO_TYPE(D_I32, D_I32)
         IO_TYPE(D_F32, D_F32)
         IO_TYPE(D_F16, D_F16)
         IO_TYPE(D_BF16, D_BF16)
@@ -253,6 +253,7 @@ static vsi_bool op_check
         IO_TYPE(D_BOOL8, D_I32)
         IO_TYPE(D_BOOL8, D_U16)
         IO_TYPE(D_BOOL8, D_U32)
+        IO_TYPE(D_BOOL8, D_BF16)
         IO_TYPE(D_U8|Q_ASYM, D_BOOL8)
         IO_TYPE(D_I8|Q_ASYM, D_BOOL8)
         IO_TYPE(D_I8|Q_DFP, D_BOOL8)
@@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup
     vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
     vsi_bool ret = TRUE;
 
-    out_rank = inputs[0]->attr.dim_num;
+    out_rank = vsi_nn_get_tensor_dims(inputs[0]);
     for ( i = 1; i < self->input.num; i++)
     {
-        in2_rank = inputs[i]->attr.dim_num;
+        in2_rank = vsi_nn_get_tensor_dims(inputs[i]);
         out_rank = vsi_nn_max( out_rank, in2_rank );
     }
 
@@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup
     {
         vsi_size_t sz0, sz1;
 
-        sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1;
+        sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1;
         for ( j = 1; j < self->input.num; j++)
         {
-            sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1;
+            sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? inputs[j]->attr.size[i] : 1;
             sz0 = vsi_nn_max( sz0, sz1 );
             if (sz0 != sz1 && sz0 != 1 && sz1 != 1)
             {
@@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup
     {
         outputs[0]->attr.dim_num = out_rank;
         memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
-        if (out_rank == 1 &&
-            vsi_nn_GetTensorIsScalar(inputs[0]) &&
+        if (vsi_nn_GetTensorIsScalar(inputs[0]) &&
             vsi_nn_GetTensorIsScalar(inputs[1]))
         {
             vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
         }
     }
     else
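The broadcast check above follows the usual numpy-style rule: for each dimension the two sizes must either match or one of them must be 1, and the output takes the larger size. A standalone C sketch of that per-dimension rule (not ovxlib API):

#include <stdbool.h>
#include <stdint.h>

bool broadcast_dim(uint64_t a, uint64_t b, uint64_t* out)
{
    if (a != b && a != 1 && b != 1)
    {
        return false;               /* shapes are incompatible */
    }
    *out = (a > b) ? a : b;
    return true;
}

For example, broadcasting (4, 1) against (1, 8) yields (4, 8), while (4, 3) against (4, 2) is rejected.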
@@ -199,6 +199,7 @@ static vsi_bool op_setup
     if (o_rank == 0)
     {
         outputs[0]->attr.size[0] = 1;
+        outputs[0]->attr.dim_num = 1;
         vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
     }
     else
@@ -306,6 +306,8 @@ static vsi_bool _op_check
         IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP)
         IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM)
         IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM)
+        IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM)
+        IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM)
     END_IO_TYPE_DECL(GROUP_NORM)
     if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num))
     {
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_graph.h"
@@ -197,6 +198,7 @@ static vsi_bool op_setup_default
     vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL };
     vsi_nn_internal_tensor_t * h_times_r = NULL;
     vsi_nn_tensor_attr_t attr;
+    vsi_nn_activation_e recurrent_activation = p->recurrent_activation;
 
     vsi_nn_internal_init_node_wksp( self );
 
@@ -230,7 +232,8 @@ static vsi_bool op_setup_default
     memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
     attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
     if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
-        self->graph->ctx->config.support_stream_processor)
+        (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor &&
+        recurrent_activation == VSI_NN_ACT_SIGMOID))
     {
         attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
     }
@@ -93,37 +93,15 @@ static vsi_bool op_check
 {
     BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
         IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
         IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
-        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
-        IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
-        IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
-        IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
-        IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
-        IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
-        IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
-        IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
         IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
         IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_node.h"
@@ -351,7 +352,7 @@ static vsi_bool op_setup
     }
     else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 &&
                 outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) ||
-              self->graph->ctx->config.support_stream_processor )
+              ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
     {
         vsi_nn_internal_tensor_t* output_tensor = NULL;
         vsi_nn_internal_tensor_t* reshape_tensor = NULL;
@@ -106,7 +106,7 @@ static vsi_bool op_setup
 
     vsi_nn_internal_init_node_wksp( self );
 
-    if ( axis != 0 && !self->graph->ctx->config.support_stream_processor)
+    if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
     {
         vsi_nn_internal_tensor_t* mean_tensor = NULL;
         vsi_nn_internal_tensor_t* vari_tensor = NULL;
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_graph.h"
@@ -139,7 +140,7 @@ static vsi_bool op_setup
 
     p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL;
     p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL;
-    if (self->graph->ctx->config.support_stream_processor)
+    if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
    {
         p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL;
     }
@@ -100,6 +100,7 @@ static vsi_bool op_check
         IO_TYPE(D_I32, D_I16|Q_ASYM)
         IO_TYPE(D_I32, D_I16|Q_SYM)
         IO_TYPE(D_I32, D_I32)
+        IO_TYPE(D_I32, D_BF16)
         IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM)
         IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM)
         IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
@@ -111,8 +112,10 @@ static vsi_bool op_check
         IO_TYPE(D_U8|Q_ASYM, D_BF16)
         IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
         IO_TYPE(D_I8|Q_ASYM, D_F16)
+        IO_TYPE(D_I8|Q_ASYM, D_BF16)
         IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
         IO_TYPE(D_I8|Q_DFP, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_BF16)
         IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
         IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
@@ -124,11 +127,14 @@ static vsi_bool op_check
         IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
         IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM)
         IO_TYPE(D_I16|Q_ASYM, D_F16)
+        IO_TYPE(D_I16|Q_ASYM, D_BF16)
         IO_TYPE(D_I16|Q_ASYM, D_F32)
         IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
         IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
+        IO_TYPE(D_I16|Q_SYM, D_BF16)
         IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
         IO_TYPE(D_I8|Q_SYM, D_F16)
+        IO_TYPE(D_I8|Q_SYM, D_BF16)
         IO_TYPE(D_BF16, D_BF16)
     END_IO_TYPE_DECL(ONE_HOT)
     if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num))
@@ -36,6 +36,7 @@
 #include "vsi_nn_tensor_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
 
 #define _INPUT_NUM (1)
 #define _OUTPUT_NUM (1)
@@ -50,33 +51,52 @@ static vsi_status op_compute
     vsi_status status = VSI_FAILURE;
     vsi_nn_kernel_param_t * param = NULL;
     vsi_nn_kernel_node_t n = NULL;
-    param =vsi_nn_kernel_param_create();
-
-    vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x );
-    vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y );
-    vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left );
-    vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top );
-    vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean );
-    vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean );
-    vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean );
-    vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale );
-    vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale );
-    vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale );
-    vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel );
-    vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm );
-    vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy );
-    n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param );
-    if( n != NULL )
+    vsi_nn_tensor_t* reshape_tensor = NULL;
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_nn_pre_process_rgb_param* p = NULL;
+
+    memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t));
+
+    shape[0] = shape[1] * shape[0];
+    shape[1] = shape[2];
+    shape[2] = 1;
+
+    reshape_tensor = vsi_nn_reshape_tensor(self->graph,
+        inputs[0], shape, inputs[0]->attr.dim_num);
+    CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final);
+
+    p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb);
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x );
+    vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y );
+    vsi_nn_kernel_param_add_int32( param, "left", p->rect.left );
+    vsi_nn_kernel_param_add_int32( param, "top", p->rect.top );
+    vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean );
+    vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean );
+    vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean );
+    vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale );
+    vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale );
+    vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale );
+    vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel );
+    vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm );
+    vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy );
+    n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param );
+    if ( n != NULL )
     {
         self->n = (vx_node)n;
         status = VSI_SUCCESS;
     }
 
-    if(param != NULL)
+    if (param != NULL)
     {
         vsi_nn_kernel_param_release( &param );
     }
 
+final:
+    vsi_safe_release_tensor(reshape_tensor);
+
     return status;
 } /* op_compute() */
@@ -166,35 +186,57 @@ static vsi_bool op_setup
     }
 
-    self->nn_param.pre_process_rgb.local.enable_perm = FALSE;
+    p->local->enable_perm = FALSE;
 
-    if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE)
+    if (p->local->enable_perm == FALSE)
     {
-        p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
-        p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
+        p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
+        p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
     }
     else
     {
-        p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
-        p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
+        p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
+        p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
     }
 
-    p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15)));
+    p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));
 
     return TRUE;
 } /* op_setup() */
 
+static vsi_status op_init
+    (
+    vsi_nn_node_t* self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    self->nn_param.pre_process_rgb.local =
+        (vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data));
+
+    if (NULL == self->nn_param.pre_process_rgb.local)
+    {
+        return VX_ERROR_NO_MEMORY;
+    }
+
+    memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data));
+
+    return status;
+} /* op_init() */
+
 static vsi_status op_deinit
     (
     vsi_nn_node_t * self
     )
 {
-    if (self->nn_param.pre_process_rgb.local.local_tensor != NULL)
+    if (self->nn_param.pre_process_rgb.local->local_tensor != NULL)
     {
-        vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor);
-        self->nn_param.pre_process_rgb.local.local_tensor = NULL;
+        vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor);
+        self->nn_param.pre_process_rgb.local->local_tensor = NULL;
     }
 
+    vsi_nn_safe_free(self->nn_param.pre_process_rgb.local);
+
     vsi_nn_op_common_deinit(self);
 
     return VSI_SUCCESS;
@@ -208,7 +250,7 @@ extern "C" {
 DEF_OP_REG
     (
     /* op_name */ PRE_PROCESS_RGB,
-    /* init */ NULL,
+    /* init */ op_init,
     /* compute */ op_compute,
     /* deinit */ op_deinit,
     /* check */ op_check,
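The scale factors computed in op_setup are Q15 fixed point: (extent << 15) / out_extent encodes the resize ratio with 15 fractional bits, so a value of exactly 1 << 15 means "no scaling" and is what enables the pure-copy path. A standalone C sketch of the arithmetic (not ovxlib API; the function names are illustrative):

#include <stdbool.h>
#include <stdint.h>

int32_t q15_scale(uint32_t src_extent, uint32_t dst_extent)
{
    return (int32_t)((src_extent << 15) / dst_extent);   /* ratio in Q15 */
}

bool rgb_is_copy(int32_t scale_x, int32_t scale_y)
{
    return (scale_x == scale_y) && (scale_x == (1 << 15));
}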
@@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute
     vsi_status status = VSI_FAILURE;
     vsi_nn_prelu_param *prelu = &self->nn_param.prelu;
     vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 };
-    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+    vsi_nn_tensor_t* input0 = NULL;
+    vsi_nn_tensor_t* input1 = NULL;
+    vsi_nn_tensor_t* output = NULL;
     vsi_bool one_rank = FALSE;
     vsi_bool is_per_channel_alpha = 0;
     vsi_size_t alpha_shape = 1;
@@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute
     uint32_t dims = outputs[0]->attr.dim_num;
 
     reshape_tensors[0] = inputs[0];
+    reshape_tensors[2] = outputs[0];
     one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape);
 
     for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
@@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute
             dims = inputs[1]->attr.dim_num;
         }
 
-        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+        input1 = vsi_nn_reshape_tensor( self->graph,
             inputs[1], (vsi_size_t*)shapes, dims );
+        CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+        reshape_tensors[1] = input1;
     }
     else
     {
         memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
-        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+        input1 = vsi_nn_reshape_tensor( self->graph,
             inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num );
+        CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+        reshape_tensors[1] = input1;
     }
 }
 else
 {
+    uint32_t rank = inputs[0]->attr.dim_num;
     dims = inputs[1]->attr.dim_num;
 
     memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
@@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute
         shapes[1] = 1;
         dims = 2;
     }
+    else if (one_rank && inputs[1]->attr.is_const == TRUE &&
+        alpha_shape == inputs[0]->attr.size[0] &&
+        alpha_shape == inputs[1]->attr.size[0] &&
+        rank < 3)
+    {
+        is_per_channel_alpha = TRUE;
+        shapes[0] = 1;
+        shapes[1] = 1;
+        shapes[2] = alpha_shape;
+        shapes[3] = rank > 1 ? inputs[0]->attr.size[1] : 1;
+        dims = 4;
+        input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims);
+        CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
+        reshape_tensors[0] = input0;
+        output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims);
+        CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
+        reshape_tensors[2] = output;
+        shapes[0] = alpha_shape;
+        shapes[1] = 1;
+        dims = 2;
+    }
 
-    reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+    input1 = vsi_nn_reshape_tensor( self->graph,
         inputs[1], (vsi_size_t*)shapes, dims );
+    CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+    reshape_tensors[1] = input1;
 }
 
 // Add params
@@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute
     self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
         kernel_name,
         &reshape_tensors[0], 2,
-        outputs, 1, param );
+        &reshape_tensors[2], 1, param );
 
     vsi_nn_kernel_param_release( &param );
-    vsi_nn_ReleaseTensor( &reshape_tensors[1] );
-    if( self->n )
+    if ( self->n )
     {
         status = VSI_SUCCESS;
     }
 
+final:
+    vsi_safe_release_tensor(input0);
+    vsi_safe_release_tensor(input1);
+    vsi_safe_release_tensor(output);
+
     return status;
 } /* _prelu_op_compute() */
@@ -211,28 +247,36 @@ static vsi_bool op_check
     )
 {
     BEGIN_IO_TYPE_DECL(PRELU, 2, 1)
         IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
         IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
         IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
         IO_TYPE(D_F16, D_F16, D_F16)
         IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
         IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
         IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
         IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
+        IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
+        IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
         IO_TYPE(D_BF16, D_F16, D_BF16)
         IO_TYPE(D_BF16, D_BF16, D_BF16)
         IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
         IO_TYPE(D_F32, D_F32, D_F32)
         IO_TYPE(D_I32, D_I32, D_I32)
 
     /* HW 9.0 */
         IO_TYPE(D_F32, D_BF16, D_BF16)
         IO_TYPE(D_BF16, D_BF16, D_F32)
     END_IO_TYPE_DECL(PRELU)
-    if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
+    if (!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
         char* desc = generate_op_io_types_desc(inputs,
             self->input.num, outputs, self->output.num);
         VSILOGE("Inputs/Outputs data type not support: %s", desc);
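The new per-channel branch only changes the layout, not the operation: it appears to lay a constant one-rank alpha out on a channel axis (input reshaped to (1, 1, C, N), alpha to (C, 1)) so that alpha[c] broadcasts across each channel. The element-wise rule PRELU then evaluates is the familiar one; a one-line C sketch (not ovxlib code):

float prelu_ref(float x, float alpha_c)
{
    return (x >= 0.0f) ? x : alpha_c * x;   /* per-channel slope on the negative side */
}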
@@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type
     int32_t * axes = self->nn_param.reduce.local2->axes;
     int32_t axes_num = self->nn_param.reduce.local2->axes_num;
 
-    if ( !self->graph->ctx->config.support_stream_processor ||
+    if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
         (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
     {
         return FALSE;
@@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis(
         }
         *out_rank_x = inputs[0]->attr.dim_num;
     }
-    else if (!self->graph->ctx->config.support_stream_processor ||
+    else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
         resolved_dim_count > 2)
     {
         optimzation_input_size(
@ -61,7 +61,7 @@ static vsi_status op_compute
     vx_nn_reshape_params_t reshape_param;

     memset(&attr, 0, sizeof(attr));
-    attr.size[0] = self->nn_param.reshape.dim_num;
+    attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1);
     attr.dim_num = 1;
     attr.is_const = TRUE;
     attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -124,17 +124,28 @@ static vsi_bool op_setup
     vsi_bool ret = TRUE;
     if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
     {
-        vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
-        uint32_t i = 0;
-        for (i = 0; i < self->nn_param.reshape.dim_num; i++)
-        {
-            shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
-                (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
-        }
-        ret = vsi_nn_CalcReshapeTensor(inputs[0],
-            outputs[0],
-            shape,
-            self->nn_param.reshape.dim_num);
+        if (self->nn_param.reshape.dim_num == 0 ||
+            self->nn_param.reshape.size == NULL
+            )
+        {
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
+            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+        }
+        else
+        {
+            vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+            uint32_t i = 0;
+            for (i = 0; i < self->nn_param.reshape.dim_num; i++)
+            {
+                shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
+                    (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
+            }
+            ret = vsi_nn_CalcReshapeTensor(inputs[0],
+                outputs[0],
+                shape,
+                self->nn_param.reshape.dim_num);
+        }
     }

     return ret;
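Note: the new branch gives a zero-rank reshape a placeholder `{1}` shape plus a scalar flag, and `vsi_nn_get_tensor_dims` (added later in this commit) folds that flag back to rank 0. A standalone sketch of the convention, with the ovxlib types and setters stubbed out by plain C:

#include <stdio.h>
#include <stddef.h>

typedef struct {
    size_t size[6];
    unsigned dim_num;
    int is_scalar;   /* stands in for vsi_nn_SetTensorIsScalar() */
} tensor_attr_t;

/* Mirror of the new reshape rule: a rank-0 request becomes a 1-element
 * tensor stored as {1} with the scalar flag raised. */
static void setup_reshape_output(tensor_attr_t *out,
                                 const size_t *new_shape, unsigned dim_num)
{
    if (dim_num == 0 || new_shape == NULL) {
        out->size[0] = 1;
        out->dim_num = 1;
        out->is_scalar = 1;
    } else {
        unsigned i;
        for (i = 0; i < dim_num; i++) out->size[i] = new_shape[i];
        out->dim_num = dim_num;
        out->is_scalar = 0;
    }
}

/* Matches the new vsi_nn_get_tensor_dims(): scalars report rank 0. */
static unsigned reported_dims(const tensor_attr_t *t)
{
    return t->is_scalar ? 0 : t->dim_num;
}

int main(void)
{
    tensor_attr_t t = { {0}, 0, 0 };
    setup_reshape_output(&t, NULL, 0);
    printf("stored rank=%u, reported rank=%u\n", t.dim_num, reported_dims(&t));
    return 0;
}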
@ -66,7 +66,7 @@ static vsi_status op_compute
     }

     memset(&attr, 0, sizeof(attr));
-    attr.size[0] = self->nn_param.reshape2.dim_num;
+    attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1);
     attr.dim_num = 1;
     attr.is_const = TRUE;
     attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -161,13 +161,24 @@ static vsi_bool op_setup
     vsi_bool ret = TRUE;
     if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
     {
-        vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
-        memcpy(shape, self->nn_param.reshape2.size,
-            sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
-        ret = vsi_nn_CalcReshapeTensor(inputs[0],
-            outputs[0],
-            shape,
-            self->nn_param.reshape2.dim_num);
+        if (self->nn_param.reshape2.dim_num == 0 ||
+            self->nn_param.reshape2.size == NULL
+            )
+        {
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
+            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+        }
+        else
+        {
+            vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+            memcpy(shape, self->nn_param.reshape2.size,
+                sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
+            ret = vsi_nn_CalcReshapeTensor(inputs[0],
+                outputs[0],
+                shape,
+                self->nn_param.reshape2.dim_num);
+        }
     }

     return ret;
@ -0,0 +1,145 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_constraint_check.h"
+#include "utils/vsi_nn_dtype_util.h"
+#include "vsi_nn_error.h"
+
+typedef struct _rope_local_data_t {
+    int32_t placeholder;
+} rope_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM          (3)
+#define _OUTPUT_NUM         (1)
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_param_t* param = NULL;
+    int32_t axis = self->nn_param.rope.axis;
+    vsi_bool interleaved = self->nn_param.rope.interleaved;
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32(param, "axis", axis);
+    vsi_nn_kernel_param_add_int32(param, "interleaved", interleaved);
+    self->n = (vx_node)vsi_nn_kernel_selector(self->graph, "rope",
+        inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param);
+
+    if ( self->n )
+    {
+        status = VSI_SUCCESS;
+    }
+    if (param != NULL)
+    {
+        vsi_nn_kernel_param_release(&param);
+    }
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(ROPE, _INPUT_NUM, _OUTPUT_NUM)
+        IO_TYPE(D_F32, D_F32, D_F32, D_F32)
+        IO_TYPE(D_BF16, D_BF16, D_BF16, D_BF16)
+        IO_TYPE(D_F16, D_F16, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM)
+        IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM)
+        IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I8|Q_SYM)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM)
+        IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_I8|Q_SYM)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_I8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I16|Q_SYM)
+        IO_TYPE(D_U16|Q_ASYM, D_F16, D_F16, D_U16|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I8|Q_SYM)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_U8|Q_ASYM)
+    END_IO_TYPE_DECL(ROPE)
+    if (!VALIDATE_OP_IO_TYPES(ROPE, self, inputs, self->input.num, outputs, self->output.num))
+    {
+        char* desc = generate_op_io_types_desc(inputs,
+            self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name    */ ROPE,
+    /* init       */ NULL,
+    /* compute    */ op_compute,
+    /* deinit     */ vsi_nn_op_common_deinit,
+    /* check      */ op_check,
+    /* setup      */ vsi_nn_op_common_setup,
+    /* optimize   */ NULL,
+    /* input_num  */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+
+__END_DECLS
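Note: the new `rope` op registers a kernel with three inputs and forwards an `axis`/`interleaved` pair; the diff does not document the operands, but rotary-embedding kernels conventionally take the data tensor plus precomputed cos and sin tables. For reference, a self-contained sketch of the interleaved rotary math such a kernel computes; this is the textbook formulation, not ovxlib's kernel source:

#include <stdio.h>

/* Interleaved RoPE: consecutive pairs (x[2i], x[2i+1]) are rotated by a
 * per-pair angle, typically pos * base^(-2i/dim). The cos/sin tables are
 * passed in, matching a 3-input (data, cos, sin) kernel signature. */
static void rope_interleaved(float *x, const float *cos_tab,
                             const float *sin_tab, int dim)
{
    int i;
    for (i = 0; i < dim / 2; i++) {
        float x0 = x[2 * i], x1 = x[2 * i + 1];
        x[2 * i]     = x0 * cos_tab[i] - x1 * sin_tab[i];
        x[2 * i + 1] = x0 * sin_tab[i] + x1 * cos_tab[i];
    }
}

int main(void)
{
    float x[4] = { 1.f, 0.f, 0.f, 1.f };
    float c[2] = { 0.f, 1.f };   /* cos of 90deg, cos of 0deg */
    float s[2] = { 1.f, 0.f };   /* sin of 90deg, sin of 0deg */
    rope_interleaved(x, c, s, 4); /* first pair rotates to (0, 1) */
    printf("%.1f %.1f %.1f %.1f\n", x[0], x[1], x[2], x[3]);
    return 0;
}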
@ -25,6 +25,7 @@
 #include <stdlib.h>

 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_node.h"
@ -188,7 +189,7 @@ static vsi_status op_optimize
     }
     if ( _need_split_softmax(self, inputs) == FALSE ||
          self->nn_param.softmax_internal.axis != 0 ||
-         self->graph->ctx->config.support_stream_processor )
+         ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
     {
         return status;
     }
@ -39,6 +39,10 @@
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
 #include "vsi_nn_error.h"

+typedef struct _topk_local_data_t {
+    vsi_bool use_internal_node;
+} topk_local_data_t;
+
 #define _INPUT_NUM          (1)
 #define _OUTPUT_NUM         (2)
@ -111,19 +115,43 @@ static vsi_status op_compute
     vsi_nn_tensor_t * out1_tensor = NULL;
     vsi_bool ret = FALSE;

-    if (inputs[0]->attr.size[axis] == 1)
+    if (self->nn_param.topk.local->use_internal_node)
     {
         return vsi_nn_internal_compute_node( self );
     }

-    ret = vsi_nn_kernel_optimize_softmax_shape(
-        inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
-        shapes[0], &rank_in, &new_axis0);
-
-    ret = vsi_nn_kernel_optimize_softmax_shape(
-        outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
-        shapes[1], &rank_out, &new_axis1);
+    if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH)
+    {
+        int32_t i = 1;
+
+        shapes[0][0] = inputs[0]->attr.size[0];
+        shapes[1][0] = outputs[0]->attr.size[0];
+        shapes[0][1] = 1;
+        shapes[1][1] = 1;
+        for (i = 1; i < (int32_t)(inputs[0]->attr.dim_num); i++)
+        {
+            shapes[0][1] = shapes[0][1] * inputs[0]->attr.size[i];
+        }
+        for (i = 1; i < (int32_t)(outputs[0]->attr.dim_num); i++)
+        {
+            shapes[1][1] = shapes[1][1] * outputs[0]->attr.size[i];
+        }
+        new_axis0 = axis;
+        new_axis1 = axis;
+        rank_in = 2;
+        rank_out = 2;
+        ret = TRUE;
+    }
+    else
+    {
+        ret = vsi_nn_kernel_optimize_softmax_shape(
+            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+            shapes[0], &rank_in, &new_axis0);
+
+        ret = vsi_nn_kernel_optimize_softmax_shape(
+            outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
+            shapes[1], &rank_out, &new_axis1);
+    }
     if (ret)
     {
         uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0};
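Note: when `size[0]` already exceeds `GPU_TENSOR_MAX_WIDTH`, the generic shape optimizer is skipped and the tensor is collapsed by hand into a 2-D view `[size[0], prod(size[1..n-1])]`. The arithmetic in isolation:

#include <stdio.h>
#include <stddef.h>

/* Collapse an N-D shape into [size0, product of the remaining dims],
 * mirroring the TopK fallback above. */
static void flatten_to_2d(const size_t *size, unsigned dim_num, size_t out[2])
{
    unsigned i;
    out[0] = size[0];
    out[1] = 1;
    for (i = 1; i < dim_num; i++) {
        out[1] *= size[i];
    }
}

int main(void)
{
    size_t shape[4] = { 70000, 4, 3, 2 };
    size_t flat[2];
    flatten_to_2d(shape, 4, flat);
    printf("[%zu, %zu]\n", flat[0], flat[1]); /* prints [70000, 24] */
    return 0;
}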
@ -303,10 +331,12 @@ static vsi_bool op_setup
     vsi_nn_internal_tensor_t* const0_input = NULL;
     vsi_nn_tensor_attr_t attr;

+    p->local->use_internal_node = TRUE;
+
     vsi_nn_internal_init_node_wksp(self);
     curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
     CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
     curr->inputs[0] = inputs[0];
     curr->outputs[0] = outputs[0];
     vsi_nn_internal_setup_node(self, curr);
@ -318,10 +348,42 @@ static vsi_bool op_setup
         CHECK_PTR_FAIL_GOTO(const0_input, "Create tensor failed", final);
         curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
         CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
         curr->inputs[0] = const0_input->t;
         curr->outputs[0] = outputs[1];
         vsi_nn_internal_setup_node(self, curr);
     }
+    else if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
+    {
+        vsi_nn_internal_node_t* curr = NULL;
+        vsi_nn_internal_tensor_t* temp_tensor = NULL;
+        vsi_nn_tensor_attr_t attr;
+
+        p->local->use_internal_node = TRUE;
+
+        vsi_nn_internal_init_node_wksp(self);
+
+        memcpy(&attr, &inputs[0]->attr, sizeof(vsi_nn_tensor_attr_t));
+        attr.dim_num = VSI_NN_DIM_AUTO;
+        attr.vtl = TRUE;
+        attr.is_const = FALSE;
+        temp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+        CHECK_PTR_FAIL_GOTO(temp_tensor, "Create tensor failed", final);
+
+        curr = vsi_nn_internal_new_node(self, VSI_NN_OP_TOPK, 1, 2);
+        CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+        curr->node->nn_param.topk.axis = p->axis;
+        curr->node->nn_param.topk.k = p->k;
+        curr->inputs[0] = inputs[0];
+        curr->outputs[0] = temp_tensor->t;
+        curr->outputs[1] = outputs[1];
+        vsi_nn_internal_setup_node(self, curr);
+
+        curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
+        CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+        curr->inputs[0] = temp_tensor->t;
+        curr->outputs[0] = outputs[0];
+        vsi_nn_internal_setup_node(self, curr);
+    }

     return TRUE;
 final:
@ -341,7 +403,7 @@ static vsi_status op_optimize
     VSI_UNREFERENCED(outputs);

     p = &(self->nn_param.topk);
-    if (inputs[0]->attr.size[p->axis] == 1)
+    if (p->local->use_internal_node)
     {
         return vsi_nn_internal_optimize_node( self, direction );
     }
@ -357,6 +419,14 @@ static vsi_status op_init
     vsi_status status = VSI_SUCCESS;
     self->nn_param.topk.axis = 0;

+    self->nn_param.topk.local = \
+        (topk_local_data_t*)malloc(sizeof(topk_local_data_t));
+    if (NULL == self->nn_param.topk.local)
+    {
+        return VX_ERROR_NO_MEMORY;
+    }
+    memset(self->nn_param.topk.local, 0, sizeof(topk_local_data_t));
+
     return status;
 } /* op_init() */
@ -365,7 +435,12 @@ static vsi_status op_deinit
     vsi_nn_node_t * self
     )
 {
-    vsi_nn_internal_deinit_node_wksp(self);
+    if (self->nn_param.topk.local->use_internal_node)
+    {
+        vsi_nn_internal_deinit_node_wksp(self);
+    }
+
+    vsi_nn_safe_free(self->nn_param.topk.local);
     vsi_nn_op_common_deinit(self);

     return VSI_SUCCESS;
@ -475,6 +475,7 @@ static _op_param_gen_t s_op_gen[] =
     /* GROUPED_CONV3D */ NULL,
     /* COL2IM */ NULL,
     /* L1_LAYER_NORM */ NULL,
+    /* ROPE */ NULL,
     };
 _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );
@ -98,7 +98,7 @@ static VSI_INLINE_API void _convert_bfloat16_to_float
     uint32_t i;
     for( i = 0; i < size; i ++ )
     {
-        out_buffer[i] = bfp16_to_fp32( (int16_t)buffer[i] );
+        out_buffer[i] = bfp16_to_fp32( (uint16_t)buffer[i] );
     }
 } /* _convert_bfloat16_to_float */
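Note: this one-character change fixes a real bug. Routing the bfloat16 bits through `int16_t` sign-extends any value with the sign bit set when it is widened, so the subsequent shift no longer produces the intended IEEE-754 pattern (and left-shifting a negative value is undefined behavior). A minimal sketch of the conversion, assuming the usual "bf16 is the top half of a float" definition of `bfp16_to_fp32`:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bfloat16 -> float: place the 16 stored bits in the high half of a
 * 32-bit word. The operand must be unsigned; otherwise a set sign bit
 * is sign-extended before the shift and the result is wrong. */
static float bfp16_to_fp32(uint16_t bits)
{
    uint32_t u = (uint32_t)bits << 16;
    float f;
    memcpy(&f, &u, sizeof f); /* bit-cast without aliasing violations */
    return f;
}

int main(void)
{
    /* 0xBF80 is -1.0f in bfloat16 (sign bit set). */
    printf("%f\n", bfp16_to_fp32(0xBF80));
    return 0;
}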
@ -40,6 +40,7 @@
 #include "vsi_nn_prv.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_tensor_util.h"
 #include "vsi_nn_log.h"
@ -1261,7 +1262,9 @@ vsi_bool vsi_nn_is_same_quant_type(
             break;
         }
 #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
-        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: {
+        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
+        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+        {
             const float diff = (float)1e-5;
             int32_t i = 0;
             int32_t scale_cnt0 = src_dtype->group_count;
@ -1627,12 +1630,12 @@ vsi_bool vsi_nn_is_stream_process_supported_types
 {
     size_t i = 0;

-    if ( graph->ctx->config.support_stream_processor == 0 )
+    if ( ((vsi_nn_graph_prv_t*)graph)->options->config.support_stream_processor == 0 )
     {
         return FALSE;
     }

-    if ( graph->ctx->config.sp_exec_count == 0 )
+    if ( ((vsi_nn_graph_prv_t*)graph)->options->config.sp_exec_count == 0 )
     {
         return FALSE;
     }
@ -1769,3 +1772,11 @@ typedef enum

     return support;
 }
+
+uint32_t vsi_nn_get_tensor_dims
+    (
+    vsi_nn_tensor_t* tensor
+    )
+{
+    return vsi_nn_GetTensorIsScalar(tensor) ? 0 : tensor->attr.dim_num;
+}
@ -39,6 +39,9 @@ static vsi_status query_hardware_caps
 #endif
 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     vx_hardware_caps_params_ext_t paramExt;
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    vx_hardware_caps_params_ext3_t paramExt3;
+#endif

     memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t));
     status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt),
@ -73,6 +76,13 @@ static vsi_status query_hardware_caps
     }
 #endif

+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3),
+        sizeof(vx_hardware_caps_params_ext3_t));
+    context->config.support_ffd = paramExt3.supportFixedFunctionDevice;
+#endif
+
 #endif

     if(param.evis1 == TRUE && param.evis2 == FALSE)
@ -93,6 +103,85 @@ final:
     return status;
 }

+vsi_status query_hardware_caps_runtime
+    (
+    vsi_nn_context_t context,
+    vsi_nn_runtime_option_t* options
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vx_hardware_caps_params_t param;
+    VSI_UNREFERENCED(options);
+    memset(&(options->config), 0, sizeof(vsi_nn_hw_config_t));
+#if VX_STREAM_PROCESSOR_SUPPORT
+    vx_hardware_caps_params_ext2_t paramExt2;
+#endif
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    vx_hardware_caps_params_ext3_t paramExt3;
+#endif
+#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
+    vx_hardware_caps_params_ext_t paramExt;
+
+    memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt),
+        sizeof(vx_hardware_caps_params_ext_t));
+    param.evis1 = paramExt.base.evis1;
+    param.evis2 = paramExt.base.evis2;
+#else
+    memset(&param, 0, sizeof(vx_hardware_caps_params_t));
+    status = vxQueryHardwareCaps(context->c, &param, sizeof(vx_hardware_caps_params_t));
+#endif
+    TEST_CHECK_STATUS(status, final);
+
+#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
+    options->config.subGroupSize = paramExt.subGroupSize;
+#ifdef VSI_40BIT_VA_SUPPORT
+    options->config.use_40bits_va = paramExt.supportVA40;
+#endif
+#if VX_STREAM_PROCESSOR_SUPPORT
+    memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2),
+        sizeof(vx_hardware_caps_params_ext2_t));
+    if (options->enable_stream_processor)
+    {
+        options->config.support_stream_processor = paramExt.supportStreamProcessor;
+        options->config.sp_exec_count = paramExt2.streamProcessorExecCount;
+        options->config.sp_vector_depth = paramExt2.streamProcessorVectorSize;
+        if (options->config.sp_exec_count > 0)
+        {
+            options->config.sp_per_core_vector_depth =
+                options->config.sp_vector_depth / options->config.sp_exec_count;
+        }
+    }
+#endif
+
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3),
+        sizeof(vx_hardware_caps_params_ext3_t));
+    options->config.support_ffd = paramExt3.supportFixedFunctionDevice;
+#endif
+
+#endif
+
+    if(param.evis1 == TRUE && param.evis2 == FALSE)
+    {
+        options->config.evis.ver = VSI_NN_HW_EVIS_1;
+    }
+    else if(param.evis1 == FALSE && param.evis2 == TRUE)
+    {
+        options->config.evis.ver = VSI_NN_HW_EVIS_2;
+    }
+    else
+    {
+        options->config.evis.ver = VSI_NN_HW_EVIS_NONE;
+        VSILOGW("Unsupported evis version");
+    }
+
+final:
+    return status;
+}
+
 #if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
 static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER";
 static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK";
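Note: one detail worth calling out in the new runtime query: `sp_per_core_vector_depth` is only derived when `sp_exec_count` is non-zero, which avoids a divide-by-zero on parts that report a stream processor with no execution units. The guard in isolation:

#include <stdio.h>

typedef struct {
    int sp_exec_count;
    int sp_vector_depth;
    int sp_per_core_vector_depth;
} sp_caps_t;

/* Derive the per-core depth only when the divisor is meaningful. */
static void derive_per_core_depth(sp_caps_t *c)
{
    if (c->sp_exec_count > 0) {
        c->sp_per_core_vector_depth = c->sp_vector_depth / c->sp_exec_count;
    }
}

int main(void)
{
    sp_caps_t caps = { 4, 64, 0 };
    derive_per_core_depth(&caps);
    printf("per-core depth: %d\n", caps.sp_per_core_vector_depth); /* 16 */
    return 0;
}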
@ -153,6 +242,44 @@ vsi_status vsi_nn_initOptions
     return VSI_SUCCESS;
 }

+vsi_status vsi_nn_initOptions_runtime
+    (
+    vsi_nn_runtime_option_t *options,
+    vsi_nn_context_t ctx
+    )
+{
+    int32_t default_value = 1;
+
+    options->enable_shader = vsi_nn_getenv_asint(ENV_ENABLE_SHADER, 1);
+    options->enable_opcheck = vsi_nn_getenv_asint(ENV_ENABLE_OPCHECK, 1);
+#if (VX_CONCAT_OPT_SUPPORT)
+    default_value = 0;
+#else
+    default_value = 1;
+#endif
+    options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value);
+    options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
+    options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1);
+    options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1);
+    options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0);
+#if (VX_STRIDED_SLICE_OPT_SUPPORT)
+    default_value = 0;
+#else
+    default_value = 1;
+#endif
+    options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value);
+    options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0);
+    options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0);
+    options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1);
+    options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1);
+
+    /*init hw params*/
+    options->config = ctx->config;
+
+    return VSI_SUCCESS;
+}
+
+
 vsi_nn_context_t vsi_nn_CreateContext
     ( void )
 {
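Note: `vsi_nn_initOptions_runtime` seeds every toggle through `vsi_nn_getenv_asint`, an env-var-with-default lookup, then snapshots the context's hardware config into the per-graph options. A plausible standalone equivalent of that helper; the real ovxlib implementation may differ, and the variable name below is a placeholder:

#include <stdio.h>
#include <stdlib.h>

/* Read an integer from the environment, falling back to a default when
 * the variable is unset or not a number; this is how the runtime
 * options above are seeded from their VIV_VX_* / VSI_NN_* variables. */
static int getenv_asint(const char *name, int fallback)
{
    const char *s = getenv(name);
    char *end = NULL;
    long v;

    if (s == NULL || *s == '\0') return fallback;
    v = strtol(s, &end, 10);
    if (end == s) return fallback; /* no digits parsed */
    return (int)v;
}

int main(void)
{
    /* "EXAMPLE_TOGGLE" is a placeholder name, not a real ovxlib knob. */
    printf("toggle: %d\n", getenv_asint("EXAMPLE_TOGGLE", 1));
    return 0;
}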
@ -1362,7 +1362,7 @@ vsi_nn_graph_t * vsi_nn_CreateGraph
         graph->isAllowFastMode = TRUE;
         vsi_nn_MapInit( graph->node_table );
         vsi_nn_MapInit( graph->tensor_table );
-        vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options );
+        vsi_nn_initOptions_runtime( ((vsi_nn_graph_prv_t*) graph)->options, ctx );
     }
     else
     {
@ -3398,6 +3398,7 @@ char* vsi_nn_GetRunTimeVariable
 #define varSize 256
     char* value_str = (char*)malloc(sizeof(char) * varSize);
     CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final);
+    CHECK_PTR_FAIL_GOTO(graph, "Graph is NULL!", final);
     memset(value_str, 0, varSize);
     char tmp_value[varSize] = {0};
     VSI_UNREFERENCED(tmp_value);
@ -3502,6 +3503,8 @@ vsi_status vsi_nn_SetRunTimeVariable
             break;
         case VSI_VX_ENABLE_STREAM_PROCESSOR:
             options->enable_stream_processor = atoi(value);
+            options->config.support_stream_processor = atoi(value);
+            status = query_hardware_caps_runtime(graph->ctx, options);
             break;
         case VSI_VX_ENABLE_BATCH_OPT:
             options->enable_batch_opt = atoi(value);
@ -895,10 +895,13 @@ static void _convert_const_I8toU8
     attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
     attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
     attr->dtype.zero_point += 128;
-    if ( tensor->t ) vxReleaseTensor(&tensor->t);
+    if (tensor->t) vxReleaseTensor(&tensor->t);
     tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr);
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    int32_t is_sparsity = 0;
+    is_sparsity = vsi_nn_GetTensorIsSparsity(tensor);
+    vsi_nn_SetTensorIsSparsity(tensor, is_sparsity);
+#endif
 final:
     vsi_nn_safe_free( data );
 }/* _convert_const_I8toU8() */
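Note: the surrounding conversion relies on a standard identity: adding 128 to both the int8 code and the zero point leaves the dequantized value `scale * (q - zp)` unchanged; it only moves the code points into uint8 range. A worked check:

#include <stdio.h>
#include <stdint.h>

/* real = scale * (q - zp): shifting q and zp together by +128 is a no-op
 * on the dequantized value, which is why int8 data can be re-tagged as
 * uint8 by only bumping the zero point. */
int main(void)
{
    float scale = 0.5f;
    int8_t  q_i8 = -3;
    int32_t zp_i8 = 1;

    uint8_t q_u8  = (uint8_t)(q_i8 + 128);   /* 125 */
    int32_t zp_u8 = zp_i8 + 128;             /* 129 */

    float real_i8 = scale * (float)(q_i8 - zp_i8);
    float real_u8 = scale * (float)((int32_t)q_u8 - zp_u8);
    printf("%f == %f\n", real_i8, real_u8);  /* both -2.0 */
    return 0;
}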
@ -247,7 +247,8 @@ static void _set_preproc_node_input_attr
     vsi_nn_tensor_attr_t* attr,
     vsi_nn_preprocess_image_size_t* input_size,
     vsi_nn_preprocess_source_format_e* source_format,
-    vsi_nn_preprocess_source_layout_e* source_layout
+    vsi_nn_preprocess_source_layout_e* source_layout,
+    vsi_nn_preprocess_dtype_convert_t* data_convert
     )
 {
     *input_attr = *attr;
@ -266,26 +267,33 @@ static void _set_preproc_node_input_attr
     }
     if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR)
     {
-        input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
-        input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32;
+        if(data_convert != NULL)
+        {
+            input_attr->dtype = data_convert->dtype;
+        }
+        else
+        {
+            input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
+            input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32;
+        }
     }
     else
     {
         input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
         input_attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
     }
-    if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB)
+    if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB)
     {
-        if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC)
+        if (*source_layout == VSI_NN_SOURCE_LAYOUT_NCHW)
         {
-            input_attr->size[0] = input_attr->size[1]*input_attr->size[0];
-            input_attr->size[1] = input_attr->size[2];
-            input_attr->size[2] = 1;
-        }
-        else
-        {
-            input_attr->size[0] = input_attr->size[2]*input_attr->size[0];
-            input_attr->size[2] = 1;
+            vsi_size_t channel = input_attr->size[2];
+            if (channel != 3)
+            {
+                VSILOGE("RGB chanel must be 3, please have a check!");
+            }
+            input_attr->size[2] = input_attr->size[1];
+            input_attr->size[1] = input_attr->size[0];
+            input_attr->size[0] = channel;
         }
     }
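Note: if I read the rewritten NCHW branch right, it rotates the `{d0, d1, d2}` extent triple one step so the channel extent moves from last to first. The rotation in isolation; the extent meanings below are my reading of the code, not stated in the diff:

#include <stdio.h>
#include <stddef.h>

/* Rotate {d0, d1, d2} so the old last extent (the channel count in this
 * code path) becomes the new first one: {W, H, C} -> {C, W, H}. */
static void rotate_channel_first(size_t size[3])
{
    size_t channel = size[2];
    size[2] = size[1];
    size[1] = size[0];
    size[0] = channel;
}

int main(void)
{
    size_t size[3] = { 224, 112, 3 };
    rotate_channel_first(size);
    printf("{%zu, %zu, %zu}\n", size[0], size[1], size[2]); /* {3, 224, 112} */
    return 0;
}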
@ -333,15 +341,10 @@ static void _set_preproc_node_input_attr
 static void _set_preproc_node_output_attr
     (
     vsi_nn_tensor_attr_t* output_attr,
-    vsi_nn_tensor_attr_t* attr,
-    vsi_nn_preprocess_dtype_convert_t* data_convert
+    vsi_nn_tensor_attr_t* attr
     )
 {
     *output_attr = *attr;
-    if(data_convert != NULL)
-    {
-        output_attr->dtype = data_convert->dtype;
-    }
     output_attr->dtype.fmt = VSI_NN_DIM_FMT_NCHW;
     output_attr->dim_num = VSI_NN_DIM_AUTO;
     output_attr->is_const = FALSE;
@ -603,10 +606,11 @@ vsi_status vsi_nn_add_single_preproc_node
     _set_preproc_node_out_attr(node, image_resize, &org_norm_tensor->attr, source_layout);

     /* Set input tensor attr */
-    _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, source_format, source_layout);
+    _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size,
+        source_format, source_layout, data_convert);

     /* Set output tensor attr */
-    _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr, data_convert);
+    _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr);

     /* Create new norm and virtual tensors */
     if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 ||
@ -33,6 +33,7 @@
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_util.h"
 #include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_error.h"

 vsi_bool vsi_nn_rnn_find_best_kernel_size
@ -804,7 +805,7 @@ vsi_status vsi_nn_rnn_data_check_aligned
         vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size,
             input[i]->attr.dim_num, input[i]->attr.dtype.vx_type );

-        if( ofst & 0x3f && !self->graph->ctx->config.support_stream_processor)
+        if( ofst & 0x3f && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
         {
             vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor);
             output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
@ -155,6 +155,15 @@ static void print_tensor
             tensor->attr.dtype.group_size);
         ext_attr[count] = 0;
         break;
+    case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+        count = snprintf(&ext_attr[0],
+            _EXT_ATTR_BUF_SZ,
+            "ASYM GPTQ axis=%d, count=%d, group_size=%d",
+            tensor->attr.dtype.group_channel_dim,
+            tensor->attr.dtype.group_count,
+            tensor->attr.dtype.group_size);
+        ext_attr[count] = 0;
+        break;
 #endif
     default:
         vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ);
@ -449,6 +458,11 @@ static vsi_bool _init_tensor
             scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
             CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
             memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
+            zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim);
+            CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final );
+            memcpy(zeroPoints,
+                tensor->attr.dtype.zero_points,
+                tensor->attr.dtype.zero_points_dim * sizeof(int32_t));
             params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
             params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
             params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
@ -460,6 +474,32 @@ static vsi_bool _init_tensor
             VSILOGE(
                 "can't support qnt_type "
                 "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC.");
+            break;
+#endif
+        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
+            params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP;
+            // This is a hack that driver doesn't support const scales
+            scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
+            CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
+            memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
+            zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim);
+            CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final );
+            memcpy(zeroPoints,
+                tensor->attr.dtype.group_zero_points,
+                tensor->attr.dtype.group_count * sizeof(int32_t));
+            params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
+            params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
+            params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
+            params.quant_data.affinePerGroup.scales = scales;
+            params.quant_data.affinePerGroup.zero_points = zeroPoints;
+            params.quant_data.affinePerGroup.zero_point_group_count = tensor->attr.dtype.group_count;
+            break;
+#else
+            VSILOGE(
+                "can't support qnt_type "
+                "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC.");
+            break;
 #endif
         default:
             break;
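Note: per-group asymmetric quantization, which this hunk wires up, keys each value to the group its channel index falls in: `real = scale[g] * (q - zp[g])` with `g = channel / group_size`. A worked example:

#include <stdio.h>
#include <stdint.h>

/* Per-group dequantization: channels are bucketed into groups of
 * group_size along the quantized axis, each group carrying its own
 * scale and (for the asymmetric variant added here) zero point. */
static float dequant_per_group(int32_t q, int channel, int group_size,
                               const float *scales, const int32_t *zps)
{
    int g = channel / group_size;
    return scales[g] * (float)(q - zps[g]);
}

int main(void)
{
    float   scales[2] = { 0.1f, 0.2f };
    int32_t zps[2]    = { 0, 5 };

    /* Channel 3 with group_size 2 falls into group 1: 0.2 * (15 - 5). */
    printf("%f\n", dequant_per_group(15, 3, 2, scales, zps)); /* 2.0 */
    return 0;
}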
@ -1788,6 +1828,57 @@ int8_t vsi_nn_GetTensorIsScalar
     return _get_tensor_is_scalar((vsi_nn_tensor_prv_t*)tensor);
 }

+int32_t _get_tensor_is_sparsity
+    (
+    vsi_nn_tensor_prv_t* tensor
+    )
+{
+    int32_t is_sparsity = FALSE;
+    if (NULL == tensor)
+    {
+        VSILOGE("To get is_sparsity, tensor pointer SHOULD NOT be NULL.");
+        goto final;
+    }
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    is_sparsity = tensor->sparsity_type;
+#endif
+final:
+    return is_sparsity;
+}
+
+int32_t vsi_nn_GetTensorIsSparsity
+    (
+    vsi_nn_tensor_t* tensor
+    )
+{
+    return _get_tensor_is_sparsity((vsi_nn_tensor_prv_t*)tensor);
+}
+
+vsi_status vsi_nn_SetTensorIsSparsity
+    (
+    vsi_nn_tensor_t* tensor,
+    int32_t is_sparsity
+    )
+{
+    VSI_UNREFERENCED(is_sparsity);
+    vsi_status status = VSI_SUCCESS;
+    if (NULL == tensor) {
+        status = VSI_FAILURE;
+        goto final;
+    }
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    vxSetTensorAttribute(tensor->t,
+        VX_TENSOR_SPARSITY_TYPE,
+        &is_sparsity,
+        sizeof(vx_enum));
+    status = VSI_SUCCESS;
+    ((vsi_nn_tensor_prv_t*)tensor)->sparsity_type = is_sparsity;
+#endif
+final:
+    return status;
+}
+
 vsi_status vsi_nn_CopyRawDataToTensor
     (
     vsi_nn_graph_t* graph,
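Note: the getter/setter pair exists mainly so side-band attributes survive tensor re-creation; `_convert_const_I8toU8` earlier in this commit reads the sparsity tag before rebuilding the vx tensor and writes it back afterwards. The save/restore pattern in the abstract, with the tensor reduced to a stub:

#include <stdio.h>

typedef struct { int sparsity_type; } tensor_t;

/* Rebuilding a backing object loses side-band attributes, so the common
 * pattern is read -> rebuild -> write back. */
static void rebuild(tensor_t *t) { t->sparsity_type = 0; /* reset */ }

int main(void)
{
    tensor_t t = { 1 };               /* e.g. 2:4 structured sparsity */
    int saved = t.sparsity_type;      /* vsi_nn_GetTensorIsSparsity() */
    rebuild(&t);                      /* vsi_nn_CreateRawTensorFromData() */
    t.sparsity_type = saved;          /* vsi_nn_SetTensorIsSparsity() */
    printf("sparsity preserved: %d\n", t.sparsity_type);
    return 0;
}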
@ -75,6 +75,11 @@ vsi_status _set_tensor_is_scalar
     int8_t is_salar
     );

+vsi_status _set_tensor_is_sparsity(
+    vsi_nn_tensor_prv_t* tensor,
+    int32_t is_sparsity
+    );
+
 int8_t _get_tensor_is_from_axisram
     (
     vsi_nn_tensor_prv_t* tensor
@ -127,6 +132,11 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node
     vsi_nn_opt_direction_e direction
     );

+uint32_t vsi_nn_get_tensor_dims
+    (
+    vsi_nn_tensor_t* tensor
+    );
+
 #ifdef __cplusplus
 }
 #endif
@ -108,6 +108,11 @@ typedef struct _vsi_nn_tensor_prv
     /** create tensor from axisram.*/
     int8_t is_from_axisram;

+    /** 2:4 sparsity attr. */
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    vx_tensor_sparsity_param_e sparsity_type; /*!< \brief sparsity type for the tensor */
+#endif
+
     // Add tensor internal attribute here...
 } vsi_nn_tensor_prv_t;