Compare commits

...

10 Commits

Author SHA1 Message Date
Colin c05cfdc623 Update cmake . 2025-12-02 15:02:41 +00:00
Colin 1ad3aabcfe Add unified-tina and viplite runtime library in arm linux. 2025-12-02 15:01:18 +00:00
Kee 7b24f4d437
[vx_platform] Fix create sub device crash issue (#715)
* Fix native platform build issue

Redefinition of variable deviceCount

Type: Bug fix

* [vx_platform] Fix create sub device crash issue

sub_device_ variable should be initialized

Type: Bug fix

Signed-off-by: Kee <xuke537@hotmail.com>

* Fix a typo

Signed-off-by: Kee <xuke537@hotmail.com>

* Fix another typo

Signed-off-by: Kee <xuke537@hotmail.com>

---------

Signed-off-by: Kee <xuke537@hotmail.com>
2025-11-25 17:19:56 +08:00
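To illustrate the class of bug this commit describes, a hypothetical sketch (all names assumed; the real fix lives in the vx_platform code): an uninitialized member pointer read during sub-device creation is undefined behavior, and in-class initialization removes the crash.

// Hypothetical sketch only; the actual class and member names are assumed.
struct SubDevice;                      // stand-in for the real sub-device type
class DeviceSketch {
 public:
  DeviceSketch() = default;
 private:
  // Without "= nullptr" this pointer holds garbage, and any null-check or
  // dereference during sub-device creation becomes undefined behavior.
  SubDevice* sub_device_ = nullptr;
};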
Peter Kjellerstedt 3c83eca946
Add include of cstdint to permute_vector.h (#711)
This avoids the following error with GCC 15:

  src/tim/transform/ops/../permute_vector.h:41:11: error: 'uint32_t'
  does not name a type
     41 |   virtual uint32_t Rank() const = 0;
        |           ^~~~~~~~
  src/tim/transform/ops/../permute_vector.h:32:1: note: 'uint32_t' is
  defined in header '<cstdint>'; this is probably fixable by adding
  '#include <cstdint>'
     31 | #include <string>
    +++ |+#include <cstdint>
     32 |

Co-authored-by: Peter Kjellerstedt <pkj@axis.com>
2025-10-13 13:15:57 +08:00
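The shape of the fix follows directly from the diagnostic; a minimal sketch of the header after the change (class details abridged from the error output):

// src/tim/transform/permute_vector.h after the fix (abridged sketch)
#include <cstdint>   // added: declares uint32_t explicitly under GCC 15
#include <string>

class IPermuteVector {
 public:
  virtual ~IPermuteVector() = default;
  virtual uint32_t Rank() const = 0;   // line 41, which previously failed to compile
};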
Kee c4e75674fa
Refine platform code and samples (#713)
* Refine platform code and samples

1. Support viplite v2 API
2. Unify the Lite and Native platform APIs so that the same code
   can run on different platforms through different compilation options.

Type: Code Improvement

Signed-off-by: Kee <xuke537@hotmail.com>

* Fix build error if VSI device API is not supported

Signed-off-by: Kee <xuke537@hotmail.com>

---------

Signed-off-by: Kee <xuke537@hotmail.com>
2025-10-13 13:15:31 +08:00
Kee 6810d310d3
update CI workflows to use v4 of the artifact actions (#714) 2025-10-09 18:39:22 +08:00
Chen Feiyue 8494275d76
Update internal ovxlib to release/1.2.22 (#706)
* Update internal ovxlib to release/1.2.22

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

* Refine yaml file for blocking tfhub model tests

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

---------

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>
2025-01-08 13:22:46 +08:00
Kainan Cha 149834832c
Update README.md
Add ONNX Runtime Link
2024-12-12 09:24:49 +08:00
Chen Feiyue fcdf223d06
Fixed layernorm and logsoftmax ut error (#702)
Type: Bug Fix

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
2024-07-29 10:44:28 +08:00
Chen Feiyue 81b6c07c5d
Update timvx_overview.svg (#701)
Type: Documentation

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
2024-07-29 10:44:04 +08:00
246 changed files with 54658 additions and 4550 deletions

View File

@ -35,7 +35,7 @@ jobs:
run: |
cmake --install ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
- name: upload tim-vx-install
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: tim-vx-install
path: |
@ -75,7 +75,7 @@ jobs:
VIVANTE_SDK_DIR: ${{github.workspace}}/prebuilt-sdk/x86_64_linux/
steps:
- name: download tim-vx build output
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: tim-vx-install
@ -102,7 +102,7 @@ jobs:
VIV_VX_DISABLE_TP_NN_EVIS: 1
steps:
- name: download tim-vx build output
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: tim-vx-install
@ -117,21 +117,21 @@ jobs:
needs: tim-vx-build
steps:
- name: download tim-vx build output
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: tim-vx-install
- name: apply-patch-build
run: |
git config --global user.email "xiang.zhang@verisilicon.com"
git config --global user.name "xiang.zhang"
git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0
git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1
git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate
cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF
cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}}
cd ${{github.workspace}}/vx-delegate/build
make vx_delegate benchmark_model
- name: upload vx-delegate
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: vx-delegate-bin
path: |
@ -144,7 +144,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download mobilenet_v2_quant.tflite
run: |
@ -159,7 +159,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download mobilenet_v2_b8_quant.tflite
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v2_b8_quant.tflite
@ -173,7 +173,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download resnet_quant.tflite
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/resnet_quant.tflite
@ -187,7 +187,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/inception_v3_quant.tflite
@ -201,7 +201,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v3_b4_quant.tflite
@ -215,7 +215,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v3_quant.tflite
@ -229,7 +229,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mv3_depth_quant.tflite
@ -243,7 +243,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/yolo_v4_tiny_quant.tflite
@ -258,7 +258,7 @@ jobs:
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/deeplab_v3_plus_quant.tflite
@ -273,7 +273,7 @@ jobs:
# needs: vx-delegate-build
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/google/lite-model/movenet/multipose/lightning/tflite/float16/1.tflite
@ -283,68 +283,68 @@ jobs:
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite
tfhub-efficientdet-lite0:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite0:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
tfhub-efficientdet-lite1:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite1:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
tfhub-efficientdet-lite2:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite2:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
tfhub-efficientdet-lite3:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite3:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# acuity-yolov3-608-quant:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/yolov3_608relu_quant.acuity.tflite
@ -359,7 +359,7 @@ jobs:
# needs: vx-delegate-build
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite4/detection/metadata/1.tflite

View File

@ -2,13 +2,13 @@ cmake_minimum_required (VERSION 3.14)
project(tim-vx LANGUAGES C CXX)
option(BUILD_SHARED_LIBS "Build using shared libraries" ON)
option(TIM_VX_ENABLE_CUSTOM_OP "Enable custom op support" OFF)
option(TIM_VX_ENABLE_CUSTOM_OP "Enable custom op support" ON)
option(TIM_VX_ENABLE_TEST "Build the unit test" OFF)
option(TIM_VX_ENABLE_LAYOUT_INFER "Enable layout inference support" ON)
option(TIM_VX_ENABLE_NBG_PARSER "Enable NBG parser" OFF)
option(TIM_VX_ENABLE_NBG_PARSER "Enable NBG parser" ON)
option(TIM_VX_CODE_COVERAGE "Run code coverage with gcov (gcc only)" OFF)
option(TIM_VX_USE_EXTERNAL_OVXLIB "Use external OVXLIB" OFF)
option(TIM_VX_BUILD_EXAMPLES "Build demos show general usage" OFF)
option(TIM_VX_BUILD_EXAMPLES "Build demos show general usage" ON)
option(TIM_VX_ENABLE_VIPLITE "Enable lite driver api support" OFF)
option(TIM_VX_ENABLE_40BIT "Enable large memory support" OFF)
option(TIM_VX_ENABLE_PLATFORM "Enable multi devices support" OFF)

View File

@ -35,7 +35,7 @@ Main Features
- [TVM](https://github.com/VeriSilicon/tvm) (Fork)
- [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) (Official)
- [OpenCV](https://github.com/opencv/opencv/wiki/TIM-VX-Backend-For-Running-OpenCV-On-NPU) (Official)
- MLIR Dialect (In development)
- [ONNXRuntime](https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/vsinpu) (Official)
Feel free to raise a github issue if you wish to add TIM-VX for other frameworks.

View File

@ -1 +1 @@
1.2.14
1.2.22

View File

@ -9,7 +9,11 @@ list(APPEND OVXDRV_INCLUDE_DIRS
if("${CONFIG}" STREQUAL "BUILDROOT")
set(VIV_SDK_DRIVER_PREFIX "usr/lib")
else()
set(VIV_SDK_DRIVER_PREFIX "drivers")
if(EXISTS ${EXTERNAL_VIV_SDK}/drivers)
set(VIV_SDK_DRIVER_PREFIX "drivers")
else()
set(VIV_SDK_DRIVER_PREFIX "lib")
endif()
endif()
message("using driver libs from ${EXTERNAL_VIV_SDK}/${VIV_SDK_DRIVER_PREFIX}")

File diff suppressed because one or more lines are too long

Image diff (timvx_overview.svg): 56 KiB before → 53 KiB after

View File

@ -25,72 +25,58 @@
#define TIM_VX_LITE_NATIVE_H_
#include "tim/vx/platform/platform.h"
#include "vip_lite.h"
#include "nbg_linker.h"
namespace tim {
namespace vx {
namespace platform {
class LiteNativeExecutor
: public IExecutor,
public std::enable_shared_from_this<LiteNativeExecutor> {
class LiteNativeDevice : public IDevice {
public:
LiteNativeExecutor(const std::shared_ptr<IDevice>& device);
virtual ~LiteNativeExecutor();
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) override;
private:
vip_task_descriptor_t* task_descriptor_;
vip_database database_;
virtual ~LiteNativeDevice() {};
virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
virtual bool DeviceExit() = 0;
virtual void WaitDeviceIdle() = 0;
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
static bool vip_initialized;
};
class LiteNativeExecutor
: public IExecutor {
public:
virtual ~LiteNativeExecutor() {};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) = 0;
};
class LiteNativeExecutable : public IExecutable {
public:
LiteNativeExecutable(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf);
virtual ~LiteNativeExecutable();
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override;
bool Trigger(bool async) override;
bool Verify() override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
vip_network network_;
private:
void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src);
int32_t input_count_;
int32_t output_count_;
gcvip_videomemory_t* coeff_;
gcvip_videomemory_t* command_;
gcvip_videomemory_t* memory_pool_;
gcvip_videomemory_t* others_;
gcvip_videomemory_t* pre_command_;
virtual ~LiteNativeExecutable() {};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) = 0;
virtual bool Trigger(bool async) = 0;
virtual bool Verify() = 0;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) = 0;
};
class LiteNativeTensorHandle : public ITensorHandle {
public:
LiteNativeTensorHandle(const std::shared_ptr<Tensor>& tensr);
virtual ~LiteNativeTensorHandle();
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
gcvip_videomemory_t* tensor_buffer_;
virtual ~LiteNativeTensorHandle() {};
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
bool CopyDataFromTensor(void* data) = 0;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif
#endif

View File

@ -37,51 +37,41 @@ class NativeDevice : public IDevice {
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
virtual bool DeviceExit() = 0;
virtual void WaitDeviceIdle() = 0;
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
};
class NativeExecutable : public IExecutable {
public:
NativeExecutable(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf, size_t inputs,
size_t outputs);
~NativeExecutable(){};
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
bool Verify() override;
protected:
std::shared_ptr<tim::vx::ops::NBG> nb_node_;
std::vector<char> nb_buf_;
virtual ~NativeExecutable() {};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) = 0;
virtual bool Verify() = 0;
};
class NativeExecutor : public IExecutor,
public std::enable_shared_from_this<NativeExecutor> {
class NativeExecutor : public IExecutor {
public:
NativeExecutor(const std::shared_ptr<IDevice>& device);
NativeExecutor(const std::shared_ptr<IDevice>& device,
const std::shared_ptr<Context>& context);
~NativeExecutor(){};
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) override;
virtual ~NativeExecutor(){};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) = 0;
};
class NativeTensorHandle : public ITensorHandle {
public:
NativeTensorHandle(const std::shared_ptr<Tensor>& tensor);
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
virtual bool CopyDataFromTensor(void* data) = 0;
};
} // namespace platform

View File

@ -46,15 +46,12 @@ namespace platform {
class IDevice;
class IExecutable;
class ExecutableSet;
class IExecutor;
class ITensorHandle;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph,
const std::shared_ptr<IExecutor>& executor);
std::shared_ptr<IExecutable> CreateExecutableSet(
const std::vector<std::shared_ptr<IExecutable>>& executables);
class IDevice {
public:
@ -68,17 +65,25 @@ class IDevice {
virtual ~IDevice(){};
virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
device_id_t Id() const;
device_id_t Id() const { return device_id_;};
virtual void WaitDeviceIdle() = 0;
virtual bool DeviceExit() = 0;
virtual void RemoteReset();
uint32_t CoreCount() const {return core_count_;};
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
protected:
device_id_t device_id_;
uint32_t core_count_;
};
class IExecutor {
public:
//using task = std::shared_ptr<IExecutable>;
using task = std::weak_ptr<IExecutable>;
virtual ~IExecutor(){};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
@ -87,13 +92,17 @@ class IExecutor {
virtual bool Trigger(bool async = false) = 0; // todo: async=true
virtual std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) = 0;
virtual std::shared_ptr<IDevice> Device() const;
virtual std::shared_ptr<Context> Contex() const;
virtual std::shared_ptr<IDevice> Device() const {return device_;};
virtual std::shared_ptr<Context> Contex() const {return context_;};
virtual uint32_t CoreIndex() const {return core_index_; };
virtual uint32_t CoreCount() const {return core_count_; };
protected:
std::vector<task> tasks_;
std::shared_ptr<IDevice> device_;
std::shared_ptr<Context> context_;
uint32_t core_index_;
uint32_t core_count_;
};
class IExecutable : public std::enable_shared_from_this<IExecutable> {
@ -101,40 +110,24 @@ class IExecutable : public std::enable_shared_from_this<IExecutable> {
virtual ~IExecutable(){};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) = 0; // for remote
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual std::vector<std::shared_ptr<ITensorHandle>> GetOutputs() { return output_handles_;};
virtual std::vector<std::shared_ptr<ITensorHandle>> Getinputs() { return input_handles_;};
virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0; // todo: async=true
virtual bool Verify() = 0;
virtual std::shared_ptr<Graph> NBGraph() const;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) = 0;
virtual std::shared_ptr<IExecutor> Executor() const;
std::shared_ptr<Graph> NBGraph() const {return nb_graph_;};
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec ,
void* data = nullptr, uint32_t size = 0) = 0;
protected:
std::weak_ptr<IExecutor> executor_;
std::shared_ptr<Context> context_;
std::shared_ptr<Graph> nb_graph_;
};
class ExecutableSet : public IExecutable {
public:
ExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables);
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
bool Verify() override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
std::vector<std::shared_ptr<IExecutable>> Executables() const;
protected:
std::vector<std::shared_ptr<IExecutable>> executables_;
std::vector<std::shared_ptr<ITensorHandle>> input_handles_;
std::vector<std::shared_ptr<ITensorHandle>> output_handles_;
};
class ITensorHandle {
@ -142,13 +135,15 @@ class ITensorHandle {
virtual ~ITensorHandle(){};
virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
virtual bool CopyDataFromTensor(void* data) = 0;
virtual std::shared_ptr<Tensor> GetTensor() const;
virtual std::shared_ptr<Tensor> GetTensor() const { return tensor_;};
virtual TensorSpec& GetSpec() { return spec_;};
protected:
std::shared_ptr<Tensor> tensor_;
TensorSpec spec_;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif
#endif
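Read together with the samples later in this diff, the refactored interfaces are used roughly as follows; a condensed sketch (graph construction elided, error handling omitted):

// Condensed usage sketch of the refactored platform API (graph setup elided).
auto context = tim::vx::Context::Create();
auto graph = context->CreateGraph();                       // ops/tensors built elsewhere
auto devices = tim::vx::platform::IDevice::Enumerate();    // now a static on IDevice
auto device = devices[0];
auto executor = device->CreateExecutor(0, -1, context);    // core_index 0, all cores
auto executable = executor->Compile(graph);                // compile the graph to an NBG
auto input = executable->AllocateTensor(graph->InputsTensor()[0]->GetSpec());
auto output = executable->AllocateTensor(graph->OutputsTensor()[0]->GetSpec());
executable->SetInput(input);
executable->SetOutput(output);
executable->Verify();
executable->Submit(executable);    // submitted relative to itself: no ordering constraint
executor->Trigger();               // run everything submitted on this executor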

View File

@ -20,9 +20,7 @@ endif()
if(TIM_VX_ENABLE_PLATFORM)
add_subdirectory("lenet_multi_device")
add_subdirectory("multi_device")
if(${TIM_VX_ENABLE_PLATFORM_LITE})
add_subdirectory("lite_multi_device")
endif()
add_subdirectory("platform_sample")
if(TIM_VX_ENABLE_GRPC)
add_subdirectory("grpc")
endif()

View File

@ -11,5 +11,10 @@ target_include_directories(${TARGET_NAME} PRIVATE
${PROJECT_SOURCE_DIR}/include
)
target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/include
)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -33,7 +33,6 @@
#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"
#include "tim/vx/platform/native.h"
std::vector<uint8_t> input_data = {
0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 8, 0,
@ -108,17 +107,17 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
}
}
int main(int argc, char** argv) {
(void) argc, (void) argv;
auto context0 = tim::vx::Context::Create();
auto graph0 = lenet(context0);
auto graph1 = lenet(context0);
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
auto executable0 = tim::vx::platform::Compile(graph0, executor); // compile to nbg
auto executor = device->CreateExecutor(0,-1,context0);
auto executable0 = tim::vx::platform::Compile(graph0, executor);
auto input_handle0 = executable0->AllocateTensor(graph0->InputsTensor()[0]->GetSpec());
auto output_handle0 = executable0->AllocateTensor(graph0->OutputsTensor()[0]->GetSpec());
executable0->SetInput(input_handle0);
@ -127,7 +126,18 @@ int main(int argc, char** argv) {
assert(executable0->Submit(executable0));
executable0->Trigger();
auto executable1 = tim::vx::platform::Compile(graph1, executor); // compile to nbg
std::vector<float> output_data;
output_data.resize(1 * 10);
if (!output_handle0->CopyDataFromTensor(output_data.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
std::cout << "executable0 out." << std::endl;
printTopN(output_data.data(), output_data.size(), 5);
output_data.assign(output_data.size(),0);
output_handle0->CopyDataToTensor(output_data.data(), output_data.size() * sizeof(float));
auto executable1 = tim::vx::platform::Compile(graph1, executor);
auto input_handle1 = executable1->AllocateTensor(graph1->InputsTensor()[0]->GetSpec());
auto output_handle1 = executable1->AllocateTensor(graph1->OutputsTensor()[0]->GetSpec());
executable1->SetInput(input_handle1);
@ -136,34 +146,28 @@ int main(int argc, char** argv) {
assert(executable1->Submit(executable0));
executable1->Trigger();
std::vector<float> output_data1;
output_data1.resize(1 * 10);
if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
std::cout << "executable1 out." << std::endl;
printTopN(output_data1.data(), output_data1.size(), 5);
output_data1.assign(output_data1.size(),0);
output_handle1->CopyDataToTensor(output_data1.data(), output_data1.size() * sizeof(float));
executor->Submit(executable0, executable0);
executor->Submit(executable1, executable0);
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
executables0.push_back(executable0);
executables0.push_back(executable1);
auto executable_set0 = tim::vx::platform::CreateExecutableSet(executables0);
executor->Submit(executable_set0, executable_set0);
executor->Trigger();
std::vector<uint8_t> input_data0;
input_data0.resize(28 * 28);
if (!input_handle0->CopyDataFromTensor(input_data0.data())) {
std::cout << "Copy intput data fail." << std::endl;
return -1;
}
printTopN(input_data0.data(), input_data0.size(), 5);
std::vector<float> output_data;
output_data.resize(1 * 10);
std::cout << "executor out." << std::endl;
if (!output_handle0->CopyDataFromTensor(output_data.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
printTopN(output_data.data(), output_data.size(), 5);
std::vector<float> output_data1;
output_data1.resize(1 * 10);
if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;

View File

@ -1,13 +0,0 @@
message("samples/lite_multi_device")
set(TARGET_NAME "lite_multi_device")
add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc)
target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
target_include_directories(${TARGET_NAME} PRIVATE
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -1,15 +1,25 @@
## brief
The multi_device demo uses some acuity exported tim-vx networks, and running on 4 devices of NPU using platform api.
The multi_device demo uses some acuity-exported tim-vx networks and runs them on multi-core NPU devices using the platform API.
## environment
export VSIMULATOR_CONFIG=VIP9400O_PID0XD9
export VIV_MGPU_AFFINITY="1:0"
export VIV_OVX_USE_MULTI_DEVICE="1:1"
export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
## note
Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the compiler for NBG.
The driver for the NPU is the VIPLITE driver.
## requirements
Vivante SDK >= 6.4.22
ovxlib >= 1.2.26
viplite >= 2.0.0
## build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON
cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
-DOVXLIB_LIB=/path/to/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
## environment
# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
# VIV_GPU_FILE specifies the NPU hardware configuration file for the NBG compiler
export VIV_GPU_FILE="/path/to/VIP9400NANOQ_PLUS_PID0X10000055.config"
export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
## run
cd build

View File

@ -35,7 +35,6 @@
#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"
#include "tim/vx/platform/native.h"
#include "vx_lenet.h"
#include "vx_mobilenet.h"
#include "vx_resnet50.h"
@ -59,7 +58,7 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
}
template <typename T>
void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> handle) {
void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> & handle) {
std::vector<T> output_data;
output_data.resize(size);
if (!handle->CopyDataFromTensor(output_data.data())) {
@ -94,7 +93,8 @@ void executor_trigger(std::shared_ptr<tim::vx::platform::IExecutor> executor) {
}
auto context = tim::vx::Context::Create();
std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>> generate_executable(
std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>>
generate_executable(
std::shared_ptr<tim::vx::platform::IExecutor> executor,
std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> construct_func,
std::string weight_file,
@ -114,15 +114,17 @@ std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::
int main(int argc, char** argv) {
(void) argc, (void) argv;
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device0 = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor0 = std::make_shared<tim::vx::platform::NativeExecutor> (device0);
auto device1 = devices[1];
std::shared_ptr<tim::vx::platform::IExecutor> executor1 = std::make_shared<tim::vx::platform::NativeExecutor> (device1);
auto device2 = devices[2];
std::shared_ptr<tim::vx::platform::IExecutor> executor2 = std::make_shared<tim::vx::platform::NativeExecutor> (device2);
auto device3 = devices[3];
std::shared_ptr<tim::vx::platform::IExecutor> executor3 = std::make_shared<tim::vx::platform::NativeExecutor> (device3);
auto total_core_count = device0->CoreCount();
uint32_t core_index = 0;
auto use_core_count = 1;
std::vector<std::shared_ptr<tim::vx::platform::IExecutor>> executors;
for(core_index = 0; core_index < total_core_count; core_index += use_core_count) {
auto executor = device0->CreateExecutor(core_index,use_core_count, context);
executors.push_back(executor);
}
auto root = std::getenv("TIM_VX_ROOT");
assert(root != NULL);
@ -142,46 +144,57 @@ int main(int argc, char** argv) {
auto resnet50_weight_file = ROOT + "/samples/multi_device/resnet50/resnet50.export.data";
std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> resnet50_construct_func = acuitylite::resnet50::construct_graph;
std::shared_ptr<tim::vx::platform::IExecutable> lenet_0, lenet_2, lenet_3, mobilenet_1, mobilenet_2, mobilenet_3, resnet50_0, resnet50_1;
std::shared_ptr<tim::vx::platform::ITensorHandle> lenet_0_outhandle, lenet_2_outhandle, lenet_3_outhandle, mobilenet_1_outhandle, mobilenet_2_outhandle, mobilenet_3_outhandle,
resnet50_0_outhandle, resnet50_1_outhandle;
auto executor_cnt = executors.size();
std::tie(lenet_0, lenet_0_outhandle) = generate_executable(executor0, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(resnet50_0, resnet50_0_outhandle) = generate_executable(executor0, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
executor0->Submit(lenet_0, lenet_0);
executor0->Submit(resnet50_0, lenet_0);
//each executor runs 2 models.
auto lenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, lenet_construct_func, lenet_weight_file,
lenet_input_files, lenet_input_bytes);
};
auto resnet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, resnet50_construct_func, resnet50_weight_file,
resnet50_input_files, resnet50_input_bytes);
};
auto mobilenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, mobilenet_construct_func, mobilenet_weight_file,
mobilenet_input_files, mobilenet_input_bytes);
};
std::vector<std::pair<std::shared_ptr<tim::vx::platform::IExecutable>,
std::shared_ptr<tim::vx::platform::ITensorHandle>>> nets;
for (size_t i = 0; i < executor_cnt; i++) {
if(i % 3 == 0) {
//lenet + resnet
nets.push_back(lenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(resnet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
if(i % 3 == 1) {
//resnet + mobilenet
nets.push_back(resnet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(mobilenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
if(i % 3 == 2) {
//lenet + mobilenet
nets.push_back(mobilenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(lenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
}
std::vector<std::thread> threads;
for(auto executor:executors) {
threads.push_back(std::thread(executor_trigger, executor));
}
for(std::thread &t : threads) {
t.join();
}
std::tie(mobilenet_1, mobilenet_1_outhandle) = generate_executable(executor1, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
std::tie(resnet50_1, resnet50_1_outhandle) = generate_executable(executor1, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
auto executable_set1 = tim::vx::platform::CreateExecutableSet({mobilenet_1, resnet50_1});
executor1->Submit(executable_set1, executable_set1);
std::tie(lenet_2, lenet_2_outhandle) = generate_executable(executor2, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(mobilenet_2, mobilenet_2_outhandle) = generate_executable(executor2, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
auto executable_set2 = tim::vx::platform::CreateExecutableSet({lenet_2, mobilenet_2});
executor2->Submit(executable_set2, executable_set2);
std::tie(lenet_3, lenet_3_outhandle) = generate_executable(executor3, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(mobilenet_3, mobilenet_3_outhandle) = generate_executable(executor3, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
auto executable_set3 = tim::vx::platform::CreateExecutableSet({lenet_3, mobilenet_3});
executor3->Submit(executable_set3, executable_set3);
std::thread t0(executor_trigger, executor0);
std::thread t1(executor_trigger, executor1);
std::thread t2(executor_trigger, executor2);
std::thread t3(executor_trigger, executor3);
t0.join();
t1.join();
t2.join();
t3.join();
print_topN<float>(1 * 10, lenet_0_outhandle);
print_topN<float>(1 * 10, lenet_2_outhandle);
print_topN<float>(1 * 10, lenet_3_outhandle);
print_topN<float>(1 * 1001, mobilenet_1_outhandle);
print_topN<float>(1 * 1001, mobilenet_2_outhandle);
print_topN<float>(1 * 1001, mobilenet_3_outhandle);
print_topN<uint16_t>(1 * 1000, resnet50_0_outhandle);
print_topN<uint16_t>(1 * 1000, resnet50_1_outhandle);
for (auto net : nets) {
auto size = net.second->GetSpec().GetElementNum();
print_topN<float>(size, net.second);
}
return 0;
}

View File

@ -29,7 +29,7 @@
#include "tim/vx/graph.h"
#include "tim/vx/operation.h"
#include "tim/vx/tensor.h"
#include "tim/vx/platform/native.h"
#include "tim/vx/platform/platform.h"
static void printTopN() {
}
@ -46,9 +46,9 @@ int demo(int argc, char** argv) {
tim::vx::TensorSpec g0_input0, g0_output0, g1_output0, g2_output0, g3_output0, g4_output0, g5_output0;
// query device and get executor of device
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
auto executor = device->CreateExecutor(0,-1, context);
// executable0
auto executable0 = executor->Compile(g0); // compile to nbg
@ -89,33 +89,6 @@ int demo(int argc, char** argv) {
// trigger
executor->Trigger(); // run all submitted executables
/* 2. another way to run */
// executable_set0
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
executables0.push_back(executable0);
auto executable_set0 = CreateExecutableSet(executables0);
// executable_set1
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables1;
executables1.push_back(executable1);
executables1.push_back(executable3);
auto executable_set1 = CreateExecutableSet(executables1);
// executable_set2
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables2;
executables2.push_back(executable2);
executables2.push_back(executable4);
auto executable_set2 = CreateExecutableSet(executables2);
// executable_set3
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables3;
executables3.push_back(executable5);
auto executable_set3 = CreateExecutableSet(executables3);
// submit executaleSets
executable_set0->Submit(executable_set0);
executable_set1->Submit(executable_set0);
executable_set2->Submit(executable_set1);
executable_set3->Submit(executable_set2);
// trigger
executor->Trigger(); // run all submitted executableSets
printTopN();
return 0;

View File

@ -1296,7 +1296,7 @@ void resnet50::construct_graph
auto input_0 = graph->CreateTensor(input_0_spec);
tim::vx::ShapeType output_229_shape({1000,1});
tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT16, output_229_shape,
tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT32, output_229_shape,
tim::vx::TensorAttribute::OUTPUT);
auto output_229 = graph->CreateTensor(output_229_spec);

View File

@ -0,0 +1,13 @@
message("samples/platform_sample")
set(TARGET_NAME "platform_sample")
add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/platform_sample.cc)
target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/include)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -0,0 +1,25 @@
## brief
This sample demonstrates basic usage of the platform API.
## note
Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the compiler for NBG.
The driver for the NPU is the VIPLITE driver.
## requirements
Vivante SDK >= 6.4.22
ovxlib >= 1.2.26
viplite >= 2.0.0
## build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
-DOVXLIB_LIB=${VIVANTE_NOKERNEL_SDK_DIR}/drivers/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON \
-DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
## environment
# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
export VIV_GPU_FILE="/path/to/VIP9000NANOQ_PLUS_PID0X100000XX.config"
## run
cd build
./samples/platform_sample/platform_sample

View File

@ -26,8 +26,8 @@
#include "tim/vx/graph.h"
#include "tim/vx/ops.h"
#include "tim/vx/types.h"
#include "tim/vx/platform/native.h"
#include "tim/vx/platform/lite/lite_native.h"
#include "tim/vx/platform/platform.h"
int main() {
//construct tim-vx graph
@ -49,9 +49,15 @@ int main() {
std::vector<int> data_vec_i0({1, 2, 3, 4});
std::vector<int> data_vec_i1({4, 3, 2, 1});
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
std::cout << "NPU device count: " << devices.size() <<std::endl;
auto device = devices[0];
auto executor = std::make_shared<tim::vx::platform::LiteNativeExecutor>(device);
//use all cores of device 0 (core_count = -1 selects every core)
std::cout << "NPU device[0] has " << device->CoreCount() << " cores" << std::endl;
auto use_core_count = -1;
auto executor = device->CreateExecutor(0, use_core_count);
auto executable = executor->Compile(graph);
auto input0_handle = executable->AllocateTensor(input_spec);
auto input1_handle = executable->AllocateTensor(input_spec);
@ -73,6 +79,10 @@ int main() {
//each output value should be "5" in this demo
for (int i = 0; i < 4; ++i) {
std::cout << "output value: " << data[i] << std::endl;
if(data[i] != 5) {
std::cout << "test failed" << std::endl;
break;
}
}
free(data);
return 0;

View File

@ -61,8 +61,10 @@ if(TIM_VX_ENABLE_PLATFORM)
endif()
list(APPEND LITE_EXTERNAL_LIBS
${VIP_LITE_SDK}/drivers/libNBGlinker.so
${VIP_LITE_SDK}/drivers/libVIPlite.so)
list(APPEND LITE_INC_DIRS ${VIP_LITE_SDK}/include)
${VIP_LITE_SDK}/drivers/libVIPhal.so)
list(APPEND LITE_INC_DIRS
${VIP_LITE_SDK}/include
${VIP_LITE_SDK}/include/nbg_linker)
endif()
if(TIM_VX_ENABLE_GRPC)

View File

@ -26,6 +26,7 @@
#include <array>
#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>
#include <string>

View File

@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)
DEF_NODE_TYPE(custom_letterbox)

View File

@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)
DEF_OP(CUSTOM_LETTERBOX)

View File

@ -0,0 +1,61 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H
#define _VSI_NN_OP_CUSTOM_LETTERBOX_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_letterbox_param
{
struct _custom_letterbox_local_data_t* local;
int32_t new_shape_w;
int32_t new_shape_h;
vx_bool auto_bool;
vx_bool scaleFill;
vx_bool scaleup;
int32_t stride;
vx_bool center;
float mean_r;
float mean_g;
float mean_b;
float scale_r;
float scale_g;
float scale_b;
int32_t pad_value_r;
int32_t pad_value_g;
int32_t pad_value_b;
vx_bool reverse_channel;
} vsi_nn_custom_letterbox_param;
_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \
vsi_nn_custom_letterbox_h );
#ifdef __cplusplus
}
#endif
#endif
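As a hedged sketch of how the new parameter block might be filled (values illustrative; node wiring and defaults are assumptions, not taken from this diff):

#include <cstring>   // memset

// Hypothetical setup; field meanings follow common letterbox preprocessing
// (resize into new_shape, pad the remainder). All values are illustrative.
static void fill_letterbox_params(vsi_nn_custom_letterbox_param* p) {
  std::memset(p, 0, sizeof(*p));
  p->new_shape_w = 640;             // target width after letterboxing (assumed)
  p->new_shape_h = 640;             // target height (assumed)
  p->scaleup = vx_true_e;           // allow small inputs to be scaled up
  p->stride = 32;                   // pad the result to a multiple of the model stride
  p->center = vx_true_e;            // distribute padding evenly on both sides
  p->pad_value_r = p->pad_value_g = p->pad_value_b = 114;  // common fill value
  p->reverse_channel = vx_false_e;  // keep RGB channel order
}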

View File

@ -34,5 +34,6 @@
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
#include "custom/ops/vsi_nn_op_custom_letterbox.h"
#endif

View File

@ -203,3 +203,4 @@ DEF_OP(BITCAST)
DEF_OP(GROUPED_CONV3D)
DEF_OP(COL2IM)
DEF_OP(L1_LAYER_NORM)
DEF_OP(ROPE)

View File

@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param
float g_scale;
float b_scale;
/* pre process rgb layer local data structure */
vsi_nn_pre_process_rgb_lcl_data local;
vsi_nn_pre_process_rgb_lcl_data *local;
} vsi_nn_pre_process_rgb_param;
#ifdef __cplusplus

View File

@ -1,6 +1,6 @@
/****************************************************************************
*
* Copyright (c) 2020-2023 Vivante Corporation
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -21,38 +21,29 @@
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#include "tim/vx/platform/native.h"
#include "vip/virtual_device.h"
#include "graph_private.h"
namespace tim {
namespace vx {
class GraphImpl;
namespace platform {
class NativeDeviceImpl : public NativeDevice {
public:
NativeDeviceImpl(device_id_t id);
~NativeDeviceImpl(){};
bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
bool Trigger(bool async = false, async_callback cb = NULL) override;
bool DeviceExit() override;
void WaitDeviceIdle() override;
protected:
std::unique_ptr<vip::IDevice> vip_device_;
std::vector<vsi_nn_graph_t*> vsi_graph_v_;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/
#ifndef _VSI_NN_OP_ROPE_H
#define _VSI_NN_OP_ROPE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_rope_param
{
struct _rope_local_data_t* local;
// Add parameters here
int32_t axis;
vsi_bool interleaved;
} vsi_nn_rope_param;
_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \
vsi_nn_rope_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param
{
uint32_t k;
int32_t axis;
struct _topk_local_data_t* local;
} vsi_nn_topk_param;
#ifdef __cplusplus

View File

@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32
static VSI_INLINE_API float bfp16_to_fp32
(
int16_t in
uint16_t in
)
{
uint32_t t1, t2, t3;
float out;
fp32_bit_cast_t fp32_bit_cast;
t1 = in & 0x00FF; // Mantissa
t2 = in & 0xFF00; // Sign bit + Exponent
t3 = in & 0x7F00; // Exponent
fp32_bit_cast.data = (uint32_t)(in << 16);
t1 <<= 16;
t2 <<= 16; // Shift (sign + Exponent) bit into position
t1 |= t2; // Re-insert (sign + Exponent) bit
fp32_bit_cast.data = t1;
out = fp32_bit_cast.val;
return t3 == 0 ? 0.0f : out;
return out;
} /* bfp16_to_fp32() */
static VSI_INLINE_API uint16_t fp32_to_fp16
@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
*dst = fp16_to_fp32( *(int16_t *)src );
break;
case VSI_NN_TYPE_BFLOAT16:
*dst = bfp16_to_fp32( *(int16_t *)src );
*dst = bfp16_to_fp32( *(uint16_t *)src );
break;
case VSI_NN_TYPE_FLOAT8_E4M3:
*dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
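The rewrite leans on bfloat16 being exactly the top 16 bits of an IEEE-754 float: the new code widens the unsigned input, shifts it into the high half, and bit-casts, and it drops the old version's flush of zero-exponent values to 0.0f. A self-contained sketch of the same conversion:

#include <cstdint>
#include <cstring>

// bfloat16 holds the top 16 bits of a binary32 value, so converting back is a
// 16-bit left shift plus a bit-level copy (mirrors the fp32_bit_cast approach).
static float bf16_to_f32(uint16_t in) {
  uint32_t bits = static_cast<uint32_t>(in) << 16;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}
// e.g. 0x3F80 -> 1.0f, 0xBF80 -> -1.0f, 0x0000 -> 0.0f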

File diff suppressed because it is too large

View File

@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t
{
char target_name[VSI_NN_MAX_TARGET_NAME];
vsi_nn_hw_evis_t evis;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
uint32_t subGroupSize;
#endif
uint32_t use_40bits_va;
uint32_t support_stream_processor;
uint32_t sp_exec_count;
uint32_t sp_vector_depth;
uint32_t sp_per_core_vector_depth;
uint32_t support_ffd;
} vsi_nn_hw_config_t;
typedef struct _vsi_nn_runtime_option_t
@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_save_file_type;
int32_t enable_use_image_process;
int32_t enable_use_from_handle;
vsi_nn_hw_config_t config;
} vsi_nn_runtime_option_t;
/**
@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t
vsi_nn_runtime_option_t options;
} VSI_PUBLIC_TYPE *vsi_nn_context_t;
/**
* Query and set options->config hw params.
*/
OVXLIB_API vsi_status query_hardware_caps_runtime
(
vsi_nn_context_t ctx,
vsi_nn_runtime_option_t *options
);
/**
* Create context
* Create ovxlib NN runtime context.
@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions
(
vsi_nn_runtime_option_t *options
);
OVXLIB_API vsi_status vsi_nn_initOptions_runtime
(
vsi_nn_runtime_option_t *options,
vsi_nn_context_t ctx
);
/**
* Release context
* Release ovxlib NN runtime resource and reset context handle to NULL.

View File

@ -57,5 +57,8 @@
#define VSI_PER_GROUP_QUANTIZATION_SUPPORT
#endif
#define VSI_GRAPH_RUNTIME_ENV_SUPPORT
#if defined(VX_TENSOR_SPARSITY_SUPPORT)
#define VSI_TENSOR_SPARSITY_SUPPORT
#endif
#endif

View File

@ -216,6 +216,7 @@
#include "ops/vsi_nn_op_grouped_conv3d.h"
#include "ops/vsi_nn_op_col2im.h"
#include "ops/vsi_nn_op_l1_layer_norm.h"
#include "ops/vsi_nn_op_rope.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_grouped_conv3d_param grouped_conv3d;
vsi_nn_col2im_param col2im;
vsi_nn_l1_layer_norm_param l1_layer_norm;
vsi_nn_rope_param rope;
void* client_param;
/* custom node data struct define */

View File

@ -86,8 +86,10 @@ typedef enum
VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
/** perchannel float8 */
VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
/** GPQT */
/** pergroup symmetric */
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
/** pergroup asymmetric */
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9,
/** undefined type */
VSI_NN_QNT_TYPE_NA = 0xff,
} vsi_nn_qnt_type_e;
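For the two per-group variants, each group of consecutive values along the quantized axis carries its own scale (and, in the asymmetric case, its own zero point), so dequantization is roughly x ≈ scale_g * (q - zp_g) with g the group index; this appears to be the GPTQ-style scheme the replaced comment abbreviated.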

View File

@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
int8_t is_scalar
);
/**
* Get Tensor is_sparsity
* Get the is_sparsity flag of the tensor
*
* @param[in] tensor Tensor.
*
* @return is_sparsity flag of the tensor.
*/
OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity
(
vsi_nn_tensor_t* tensor
);
/**
* Set Weight Tensor whether is sparsity
* Set the is_sparsity for the tensor
*
* @param[in] tensor Tensor.
* @param[in] is_sparsity New value of the is_sparsity flag.
*
* @return VSI_SUCCESS on success, or error code otherwise.
**/
OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity(
vsi_nn_tensor_t* tensor,
int32_t is_sparsity
);
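// Usage sketch (an assumption, not part of this header): mark a weight tensor
// sparse, then read the flag back; treating the flag as 0/1 is inferred from
// the int32_t type.
//   if (vsi_nn_SetTensorIsSparsity(weights, 1) == VSI_SUCCESS &&
//       vsi_nn_GetTensorIsSparsity(weights)) {
//     /* select sparsity-aware weight handling here */
//   }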
OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
(
vsi_nn_graph_t* graph,

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 14
#define VSI_NN_VERSION_PATCH 22
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
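With this bump, VSI_NN_VERSION evaluates to 1 * 10000 + 2 * 100 + 22 = 10222, up from 10214 for release 1.2.14.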

View File

@ -0,0 +1,475 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox"
// Add kernel hashtable here
#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \
_CUSTOM_LETTERBOX_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _custom_letterbox_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8 ),
PACK_KERNEL_MAP( U8, I8 ),
PACK_KERNEL_MAP( U8, F16 ),
};
/*
* Kernel params
*/
static vx_param_description_t _custom_letterbox_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CUSTOM_LETTERBOX_PARAM_NUM _cnt_of_array( _custom_letterbox_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
VSI_UNREFERENCED(param_size);
int32_t top = 0;
int32_t bottom = 0;
int32_t left = 0;
int32_t right = 0;
float scale_w = 0;
float scale_h = 0;
int32_t resize_w = 0;
int32_t resize_h = 0;
int32_t resize_max_w = 0;
int32_t resize_max_h = 0;
float output_scale = 1.0f;
float output_zp = 0;
float out_scale_r = 0;
float out_zp_r = 0;
float out_scale_g = 0;
float out_zp_g = 0;
float out_scale_b = 0;
float out_zp_b = 0;
float pad_v_r = 0;
float pad_v_g = 0;
float pad_v_b = 0;
int32_t in_width = 0;
int32_t in_height = 0;
int32_t out_width = 0;
int32_t out_height = 0;
float mean_r = 0;
float mean_g = 0;
float mean_b = 0;
float scale_r = 0;
float scale_g = 0;
float scale_b = 0;
vx_int32 pad_value_r = 0;
vx_int32 pad_value_g = 0;
vx_int32 pad_value_b = 0;
vx_int32 r_order = 0;
vx_int32 b_order = 0;
vx_int32 reverse_channel = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel);
CHECK_STATUS_FAIL_GOTO(status, final );
in_width = (int32_t)attr[0]->shape->data[0] / 3;
in_height = (int32_t)attr[0]->shape->data[1];
out_width = (int32_t)attr[1]->shape->data[0];
out_height = (int32_t)attr[1]->shape->data[1] / 3;
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)(attr[1]->zero_point);
resize_w = out_width - left - right;
resize_h = out_height - top - bottom;
resize_max_w = out_width - right;
resize_max_h = out_height - bottom;
scale_w = (float)in_width / resize_w;
scale_h = (float)in_height / resize_h;
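/* Fold the per-channel (mean, scale) normalization and the output quantization
 * into one multiply-add per channel, q = x * out_scale_c + out_zp_c, and
 * precompute the quantized pad values from the same coefficients. */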
out_scale_r = scale_r / output_scale;
out_zp_r = output_zp - out_scale_r * mean_r;
out_scale_g = scale_g / output_scale;
out_zp_g = output_zp - out_scale_g * mean_g;
out_scale_b = scale_b / output_scale;
out_zp_b = output_zp - out_scale_b * mean_b;
pad_v_r = pad_value_r * out_scale_r + out_zp_r;
pad_v_g = pad_value_g * out_scale_g + out_zp_g;
pad_v_b = pad_value_b * out_scale_b + out_zp_b;
if (reverse_channel)
{
r_order = out_height * 2;
b_order = 0;
}
else
{
r_order = 0;
b_order = out_height * 2;
}
{
gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
0x00090909, // TCfg
0x00000000, // ASelt
0x00140003, 0x00000025, // ABin
0x000a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniLeftToFloat32_4x4 = {{
0x00010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00000002, // ABin
0x00020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtract8Data_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "top", &top );
status |= vsi_nn_kernel_gpu_add_param( node, "left", &left );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b );
status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w );
status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h );
status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w );
status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h );
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height );
status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order );
status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order );
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = out_width;
gpu_param.global_size[1] = out_height;
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_letterbox_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map;
size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map );
vx_param_description_t * param_def = _custom_letterbox_kernel_param_def;
size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def );
vx_kernel_initialize_f initializer = _custom_letterbox_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (vx_uint32)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
int32_t top = vsi_nn_kernel_param_get_int32( params, "top");
int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom");
int32_t left = vsi_nn_kernel_param_get_int32( params, "left");
int32_t right = vsi_nn_kernel_param_get_int32( params, "right");
float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r");
float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g");
float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b");
float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r");
float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g");
float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b");
int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r");
int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g");
int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b");
int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel");
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
shapes[0][0] = inputs[0]->attr.size[1] * 3;
shapes[0][1] = inputs[0]->attr.size[2];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1] * 3;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], 2 );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[1], 2 );
if (reshape_tensors[0] == NULL ||
reshape_tensors[1] == NULL)
{
goto final;
}
if (reverse_channel)
{
float mean_temp = mean_r;
float scale_temp = scale_r;
int32_t pad_value_temp = pad_value_r;
mean_r = mean_b;
mean_b = mean_temp;
scale_r = scale_b;
scale_b = scale_temp;
pad_value_r = pad_value_b;
pad_value_b = pad_value_temp;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, param_num,
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
CHECK_STATUS(status);
}
}
final:
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_letterbox, _setup )

View File

@ -35,6 +35,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
@ -42,6 +43,7 @@
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8")
#define SCALAR_INPUT_AXIS (2)
@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
{
vsi_status status = VSI_FAILURE;
int sf_size = 0;
vsi_nn_kernel_tensor_attr_t* attr = NULL;
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
float srcZP = 0.0f;
float srcScale = 1.0f;
float dstZP = 0.0f;
float dstScale = 1.0f;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
2, // workdim
@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
VSI_UNREFERENCED(param_size);
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
if (!attr)
attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
if ((!attr[0]) || (!attr[1]))
{
VSILOGE("Query failure! at line");
return status;
}
sf_size = (int)attr->shape->data[0];
sf_size = (int)attr[0]->shape->data[0];
srcScale = attr[0]->scale;
srcZP = (float)attr[0]->zero_point;
dstScale = 1.0f / attr[1]->scale;
dstZP = (float)attr[1]->zero_point;
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] =
gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
gpu_param.local_size[0]);
gpu_param.global_size[1] =
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtract8Bin_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
vsi_nn_kernel_gpu_add_param(node,
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Bin_2x8", &uniExtract8Bin_2x8 );
status |= vsi_nn_kernel_gpu_add_param(node,
"sf_size", &sf_size);
status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale);
status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP);
status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale);
status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP);
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
if(status != VSI_SUCCESS)
{
VSILOGE("Initializer failure!");
}
if (attr) vsi_nn_kernel_tensor_attr_release( &attr );
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
}
static const vx_kernel_description_t _kernel_info =
static const vx_kernel_description_t _kernel_info1 =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info =
vsi_nn_KernelDeinitializer
};
static const vx_kernel_description_t _kernel_info2 =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME_U8,
NULL,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
_softmax_initializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -146,9 +196,20 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
if (in_dtype == U8 && out_dtype == U8)
{
memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) );
}
else
{
memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) );
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
vsi_nn_tensor_t* reshape_tensors[2] = {NULL};
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t rank_in = 0;
int32_t new_axis = 0;
uint32_t i = 0;
vsi_bool ret = vx_false_e;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
axis = vsi_nn_kernel_param_get_int32(params, "axis");
ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size,
inputs[0]->attr.dim_num,
axis,
shapes[0],
&rank_in,
&new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in);
reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in);
}
else
{
return NULL;
}
if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num) ||
new_axis > 2)
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
graph, I32, &new_axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup
status = VSI_FAILURE;
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
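Note the inverted destination scale set up in the initializer (dstScale = 1.0f / attr[1]->scale): the shader can then requantize the u8 output with a multiply instead of a divide, the same trick the letterbox initializer plays with output_scale.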

View File

@ -0,0 +1,227 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
typedef struct _custom_letterbox_local_data_t {
int32_t placeholder;
} custom_letterbox_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
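/* Round half away from zero (0.5 -> 1, -0.5 -> -1); a plain (int) cast would truncate. */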
int32_t my_round(float in)
{
if (in >= 0)
{
return (int)(in + 0.5f);
}
else
{
return (int)(in - 0.5f);
}
}
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_letterbox_param * p;
p = &(self->nn_param.custom_letterbox);
int32_t shape_w = (int32_t)inputs[0]->attr.size[1];
int32_t shape_h = (int32_t)inputs[0]->attr.size[2];
int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0];
int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1];
vx_bool auto_bool = p->auto_bool;
vx_bool scaleFill = p->scaleFill;
vx_bool scaleup = p->scaleup;
int32_t stride = p->stride;
vx_bool center = p->center;
float r = 1.0f;
int32_t new_unpad_w = 0;
int32_t new_unpad_h = 0;
int32_t dw = 0;
int32_t dh = 0;
int32_t top = 0;
int32_t bottom = 0;
int32_t left = 0;
int32_t right = 0;
r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
if (!scaleup)
{
r = (float)fmin(r, 1.0f);
}
new_unpad_w = my_round(r * shape_w);
new_unpad_h = my_round(r * shape_h);
dw = new_shape_w - new_unpad_w;
dh = new_shape_h - new_unpad_h;
if (auto_bool)
{
dw = dw % stride;
dh = dh % stride;
}
else if (scaleFill)
{
dw = 0;
dh = 0;
new_unpad_w = new_shape_w;
new_unpad_h = new_shape_h;
}
if (center)
{
top = my_round(dh / 2.0f - 0.1f);
bottom = my_round(dh / 2.0f + 0.1f);
left = my_round(dw / 2.0f - 0.1f);
right = my_round(dw / 2.0f + 0.1f);
}
else
{
top = 0;
bottom = my_round(dh + 0.1f);
left = 0;
right = my_round(dw + 0.1f);
}
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top", top);
vsi_nn_kernel_param_add_int32( param, "bottom", bottom);
vsi_nn_kernel_param_add_int32( param, "left", left);
vsi_nn_kernel_param_add_int32( param, "right", right);
vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r);
vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g);
vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b);
vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r);
vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g);
vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b);
vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r);
vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g);
vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b);
vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_letterbox",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
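For intuition, a worked example (illustrative, not from the diff): letterboxing a 1280x720 input into 640x640 with scaleup set, auto_bool and scaleFill clear, and center set gives r = min(640/1280.0, 640/720.0) = 0.5, new_unpad = 640x360, dw = 0, dh = 280, hence top = bottom = 140 and left = right = 0.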
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1)
IO_TYPE(D_U8, D_F16)
IO_TYPE(D_U8, D_U8|Q_ASYM)
IO_TYPE(D_U8, D_I8|Q_DFP)
IO_TYPE(D_U8, D_I8|Q_ASYM)
IO_TYPE(D_U8, D_I8|Q_SYM)
END_IO_TYPE_DECL(LETTERBOX)
if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w;
outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h;
outputs[0]->attr.size[2] = 3;
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_LETTERBOX,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -85,18 +85,24 @@ static const struct {
HASH_CUMSUM_KERNELS(0, U8, U8)
HASH_CUMSUM_KERNELS(0, F32, F32)
HASH_CUMSUM_KERNELS(0, F32, U8)
HASH_CUMSUM_KERNELS(0, I32, I32)
HASH_CUMSUM_KERNELS(1, U8, U8)
HASH_CUMSUM_KERNELS(1, F32, F32)
HASH_CUMSUM_KERNELS(1, F32, U8)
HASH_CUMSUM_KERNELS(1, I32, I32)
HASH_CUMSUM_KERNELS(2, U8, U8)
HASH_CUMSUM_KERNELS(2, F32, F32)
HASH_CUMSUM_KERNELS(2, F32, U8)
HASH_CUMSUM_KERNELS(2, I32, I32)
HASH_CUMSUM_KERNELS_2D(0, U8, U8)
HASH_CUMSUM_KERNELS_2D(0, F32, F32)
HASH_CUMSUM_KERNELS_2D(0, F32, U8)
HASH_CUMSUM_KERNELS_2D(0, I32, I32)
HASH_CUMSUM_KERNELS_2D(1, U8, U8)
HASH_CUMSUM_KERNELS_2D(1, F32, F32)
HASH_CUMSUM_KERNELS_2D(1, F32, U8)
HASH_CUMSUM_KERNELS_2D(1, I32, I32)
HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)

View File

@ -26,6 +26,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
shader_cnt_support =
(graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 &&
((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? TRUE : FALSE;
#endif
if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
{

View File

@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] =
PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ),
PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
};

View File

@ -79,7 +79,7 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1)
PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1)

View File

@ -0,0 +1,329 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_ROPE,
} _internal_kernel_e;
#define _ROPE_KERNEL_SOURCE "rope"
#define _ROPE_KERNEL_NAME CVIVANTE_NAMESPACE("cl.rope")
// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \
CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \
"rope_0" }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _rope_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, F32, 0 ),
PACK_KERNEL_MAP( F32, F32, F32, 1 ),
PACK_KERNEL_MAP( F32, F32, F32, 2 ),
PACK_KERNEL_MAP( I32, I32, I32, 0 ),
PACK_KERNEL_MAP( I32, I32, I32, 1 ),
PACK_KERNEL_MAP( I32, I32, I32, 2 ),
PACK_KERNEL_MAP( U32, U32, U32, 0 ),
PACK_KERNEL_MAP( U32, U32, U32, 1 ),
PACK_KERNEL_MAP( U32, U32, U32, 2 ),
};
/*
* Kernel params
*/
static vx_param_description_t _rope_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
#define SCALAR_IN_ZP (5)
#define SCALAR_COS_ZP (6)
#define SCALAR_SIN_ZP (7)
#define SCALAR_SCALE0 (8)
#define SCALAR_SCALE1 (9)
#define SCALAR_OUT_ZP (10)
#define SCALAR_HALF_HEAD_SIZE (11)
#define SCALAR_STEP (12)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_rope_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0} // globalWorkSize: image size in thread
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL };
int32_t axis = 0;
vsi_size_array_t* out_shape = NULL;
vsi_size_t shape[3] = { 1 };
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
CHECK_STATUS_FAIL_GOTO(status, final);
out_shape = attr[1]->shape;
shape[0] = out_shape->data[0];
shape[1] = out_shape->data[1];
shape[2] = out_shape->data[2];
shape[axis] = shape[axis] / 2;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = shape[0];
gpu_param.global_size[1] = shape[1];
gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1;
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _rope_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _rope_kernel_map;
size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
vx_param_description_t * param_def = _rope_kernel_param_def;
vx_kernel_initialize_f initializer = _rope_initializer;
uint32_t key = 0;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type);
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \
((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24))
switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32, F32, F32):
case _PACK_SELECT_KEY(F16, F16, F16, F16):
key = ROPE_HASH_KEY(F32, F32, F32, axis);
break;
case _PACK_SELECT_KEY(U8, U8, U8, U8):
case _PACK_SELECT_KEY(U16, U16, U16, U16):
key = ROPE_HASH_KEY(U32, U32, U32, axis);
break;
case _PACK_SELECT_KEY(I8, I8, I8, I8):
case _PACK_SELECT_KEY(I16, I16, I16, I16):
case _PACK_SELECT_KEY(I32, I32, I32, I32):
key = ROPE_HASH_KEY(I32, I32, I32, axis);
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
float in_scale = vsi_nn_get_tensor_scale(inputs[0]);
float cos_scale = vsi_nn_get_tensor_scale(inputs[1]);
float sin_scale = vsi_nn_get_tensor_scale(inputs[2]);
float out_scale = vsi_nn_get_tensor_scale(outputs[0]);
float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2);
float scale0 = in_scale * cos_scale / out_scale;
float scale1 = in_scale * sin_scale / out_scale;
int32_t step = interleaved ? 2 : 1;
int32_t i = 0;
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, axis );
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis);
node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &in_zp);
node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &cos_zp);
node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &sin_zp);
node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create(
graph, F32, &scale0);
node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create(
graph, F32, &scale1);
node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp);
node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &half_head_size);
node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create(
graph, I32, &step);
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
}
}
for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++)
{
if (node_params[i])
{
vsi_nn_kernel_scalar_release(&node_params[i]);
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( rope, _setup )
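As a sanity check on the folded quantization above (standard affine-quantization algebra, not text from this diff): with x = in_scale * (q_x - in_zp) and cos = cos_scale * (q_cos - cos_zp), the product term x * cos requantizes through out_scale as (q_x - in_zp) * (q_cos - cos_zp) * scale0 + output_zp, where scale0 = in_scale * cos_scale / out_scale; scale1 plays the same role for the sin term. That is exactly what the scalars packed into the node encode.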

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup
VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
return NULL;
}

View File

@ -26,6 +26,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup
vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2);
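/* e.g. subGroupSize = 64 -> max_stages = 7 + log2(64 >> 2) = 11 (illustrative) */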
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
if (block_size >= GPU_TENSOR_MAX_WIDTH)
{
return NULL;
}
shape[0][0] = block_size;
shape[0][1] = block_num;
shape[1][0] = top_k;

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types
return FALSE;
}
if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2)
if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2)
{
return FALSE;
}

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup
temp_tensor[1] = weights;
temp_tensor[2] = biases;
ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver);
ks = get_kernel_size(weights->attr.size[0], dilation, stride,
((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver);
status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);

View File

@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] =
TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
};
@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] =
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 )
};
/*
@ -245,6 +250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
float sum_x2_tail0 = 1;
float sum_x2_tail1 = 1;
float work_item_pixels = 1;
vsi_bool is_input_8bits = FALSE;
VSI_UNREFERENCED(param_size);
@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]);
chn = (int32_t)(attr[1]->shape->data[1]);
is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8;
if (is2D)
{
height = 1;
}
work_item_pixels = (float)height * 16;
work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height;
sum_x_tail = -work_item_pixels * input_zp * input_scale;
sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2;
@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1;
if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
if (is_input_8bits)
{
shaderParam.global_size[0] = (width + 255) / 256 * 16;
}
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
shaderParam.global_size[0] = (width + 127) / 128 * 16;
}
@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
gpu_dp_inst_t uniSum_X_X2_8x2 = {{
0x55555555, // TCfg
@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
}
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
shaderParam.global_scale[0] = 8;
}
@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U16, U16 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( I16, F16 ):
case _PACK_SELECT_KEY( F16, F16 ):
@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = ((new_shape[0] + 255) / 256) * 4;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16)
{
attr.size[0] = ((new_shape[0] + 127) / 128) * 4;
}

View File

@ -124,22 +124,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
{0, 0, 0}
};
int8_t in0_fl = 0;
int32_t inputZP0 = 0;
float input_scale0 = 1.0f;
int32_t inputZP1 = 0;
float input_scale1 = 1.0f;
int32_t input0_zp = 0;
float input0_scale = 1.0f;
int32_t input1_zp = 0;
float input1_scale = 1.0f;
float output_zp = 0;
int8_t out_fl = 0;
float outputZP = 0;
int32_t shift0 = 0;
vsi_bool is_ge_fl = FALSE;
int32_t shift0 = 0;
vsi_bool is_ge_fl = FALSE;
vsi_bool is_2d_img = FALSE;
uint32_t evis_version = 0;
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
uint32_t pack_key;
vx_context ctx = vxGetContext((vx_reference)node);
vx_context ctx = vxGetContext((vx_reference)node);
vx_hardware_caps_params_t hw_param;
VSI_UNREFERENCED(param_size);
@ -165,34 +166,30 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
inputZP0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
inputZP1 = attr[1]->zero_point;
input_scale1 = attr[1]->scale;
outputZP = (float)attr[2]->zero_point;
input_scale0 = input_scale0 / attr[2]->scale;
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = (float)attr[2]->zero_point;
input0_scale = input0_scale / attr[2]->scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP &&
attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
in0_fl = (int8_t)attr[0]->dfp.fl;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
shift0 = in0_fl - out_fl;
is_ge_fl = shift0 >= 0;
}
shift0 = in0_fl - out_fl;
is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);
is_ge_fl = shift0 >= 0;
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \
(IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version );
pack_key = _PACK_SELECT_KEY(attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version);
if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
if (attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
@ -215,97 +211,97 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
switch( pack_key )
{
case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ):
case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ):
case _PACK_SELECT_KEY(I8, I8, 1, 1, 2):
case _PACK_SELECT_KEY(I16, I16, 1, 1, 2):
{
gpu_dp_inst_t uniPreluDFPLo_2x8b = { {
0x77777777, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluDFPHi_2x8b = { {
0x77777777, // TCfg
0x44444444, // ASelt
0xbbaa9988, 0xffeeddcc, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (attr[0]->dtype == I16)
{
gpu_dp_inst_t uniPreluDFPLo_2x8b = {{
0x77777777, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluDFPHi_2x8b = {{
0x77777777, // TCfg
0x44444444, // ASelt
0xbbaa9988, 0xffeeddcc, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if ( attr[0]->dtype == I16 )
{
uniPreluDFPLo_2x8b.data[7] = 0x00003000;
uniPreluDFPHi_2x8b.data[7] = 0x00003000;
}
gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 );
status = vsi_nn_kernel_gpu_add_param( node,
"uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b );
CHECK_STATUS_FAIL_GOTO(status, final );
uniPreluDFPLo_2x8b.data[7] = 0x00003000;
uniPreluDFPHi_2x8b.data[7] = 0x00003000;
}
break;
case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ):
case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ):
{
gpu_dp_inst_t uniPreluInt8_2x8 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0xb3a29180, 0xf7e6d5c4, // ABin
0x66666666, // BSelt
0x30201000, 0x70605040, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00100000, 0x00300020, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00500040, 0x00700060, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 );
gpu_dp_inst_update_postshfit(&uniPreluDFPLo_2x8b, shift0);
gpu_dp_inst_update_postshfit(&uniPreluDFPHi_2x8b, shift0);
status = vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt8_2x8", &uniPreluInt8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ):
status = vsi_nn_kernel_gpu_add_param(node,
"uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I8, I8, 1, 1, 1):
case _PACK_SELECT_KEY(I16, I16, 1, 1, 1):
{
gpu_dp_inst_t uniPreluInt8_2x8 = { {
0x55555555, // TCfg
0x00000000, // ASelt
0xb3a29180, 0xf7e6d5c4, // ABin
0x66666666, // BSelt
0x30201000, 0x70605040, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part0_4x4 = { {
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00100000, 0x00300020, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part1_4x4 = { {
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00500040, 0x00700060, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_update_postshfit(&uniPreluInt8_2x8, shift0);
gpu_dp_inst_update_postshfit(&uniPreluInt16_part0_4x4, shift0);
gpu_dp_inst_update_postshfit(&uniPreluInt16_part1_4x4, shift0);
status = vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt8_2x8", &uniPreluInt8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1):
case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2):
case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1):
case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP0", &inputZP0 );
"input0_zp", &input0_zp);
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale0", &input_scale0 );
"input0_scale", &input0_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP1", &inputZP1 );
"input1_zp", &input1_zp);
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale1", &input_scale1 );
"input1_scale", &input1_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"outputZP", &outputZP );
"output_zp", &output_zp );
if (attr[2]->dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param( node,


@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -58,53 +59,92 @@ typedef enum
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22))
#define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }
#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE )
#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE )
#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE )
#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_align_corners"), \
"resize_bilinear_align_corners" }
@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP(F16, F16),
PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
@ -672,18 +716,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
uint32_t depth = 0;
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_same_type = FALSE;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
float scale = 1.f;
int32_t input_zp = 0;
int32_t output_zp = 0;
VSI_UNREFERENCED(param_size);
@ -692,17 +741,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
scale = input_attr->scale;
input_zp = input_attr->zero_point;
scale /= output_attr->scale;
output_zp = output_attr->zero_point;
is_same_type = _is_same_quant(input_attr, output_attr);
if ((U8 == input_dtype) && (output_dtype == U8))
{
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
@ -728,206 +783,303 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
gpu_param.global_scale[2] = 1;
}
if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP[2] = { 0 };
gpu_dp_inst_t uniU8PostProcess_2x8 = { {
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize2xUp_1_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
if (!is_same_type)
{
float f2i_radio = 16.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize2xUp_0_4x8.data[7] = 0x00000700;
uniResize2xUp_1_4x8.data[7] = 0x00000700;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
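/* A reading of the !is_same_type path above (inferred from the constants):
 * the 2x DP weights (1,3,3,9) sum to 16, so the raw accumulator is
 * f2i_radio = 16 times the bilinear value, input zero point included.
 * gpu_quantize_multiplier_16bit() picks M0, postShift with
 * scale / f2i_radio ~= M0 / 2^postShift, hence
 *   out ~= (acc * M0 + (output_zp << postShift) - input_zp * M0 * f2i_radio)
 *          >> postShift
 * which is (acc / 16 - input_zp) * scale + output_zp, the usual affine
 * requantization; resetting data[7] removes the interpolation's own
 * post-shift so the division by 16 happens here instead. The 3x/4x/8x
 * branches below repeat this with f2i_radio = 256, 64 and 256. */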
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = { {
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l01_2x8 = { {
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l10_4x4 = { {
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l11_4x4 = { {
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l12_4x4 = { {
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l13_4x4 = { {
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize3xUp_l00_2x8.data[7] = 0x00000608;
uniResize3xUp_l01_2x8.data[7] = 0x00000608;
uniResize3xUp_l10_4x4.data[7] = 0x00000607;
uniResize3xUp_l11_4x4.data[7] = 0x00000607;
uniResize3xUp_l12_4x4.data[7] = 0x00000607;
uniResize3xUp_l13_4x4.data[7] = 0x00000607;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final);
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l01_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l10_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l11_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 64.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize4xUp_l00_4x8.data[7] = 0x00000400;
uniResize4xUp_l01_4x8.data[7] = 0x00000400;
uniResize4xUp_l10_4x8.data[7] = 0x00000400;
uniResize4xUp_l11_4x8.data[7] = 0x00000400;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l01_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l10_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l11_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l20_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l21_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l30_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l31_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize8xUp_l00_4x8.data[7] = 0x00000700;
uniResize8xUp_l01_4x8.data[7] = 0x00000700;
uniResize8xUp_l10_4x8.data[7] = 0x00000700;
uniResize8xUp_l11_4x8.data[7] = 0x00000700;
uniResize8xUp_l20_4x8.data[7] = 0x00000700;
uniResize8xUp_l21_4x8.data[7] = 0x00000700;
uniResize8xUp_l30_4x8.data[7] = 0x00000700;
uniResize8xUp_l31_4x8.data[7] = 0x00000700;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
}
else
{
@ -1193,22 +1345,22 @@ static vsi_status _query_kernel
if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
{
if ((!align_corners) && (half_pixel_centers) && is_2x_upsample)
{
scale_flag = UP_2X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample)
{
scale_flag = UP_3X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample)
{
scale_flag = UP_4X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample)
{
scale_flag = UP_8X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
@ -1232,7 +1384,7 @@ static vsi_status _query_kernel
scale_flag = DOWN;
}
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
@ -1244,7 +1396,7 @@ static vsi_status _query_kernel
if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
{
scale_flag = UP;
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
@ -1257,7 +1409,7 @@ static vsi_status _query_kernel
if ((UP == scale_flag) && (i >= kernel_map_size))
{
scale_flag = DOWN;
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16
size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype);
vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8;
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
return FALSE;
}
@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
vsi_bool is_evis2 = \
(vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2);
vsi_bool is_run_opt_kernel = FALSE;
vsi_nn_tensor_t* scale = NULL;
int32_t pad_left = half_pixel_centers ? 1 : 0;


@ -0,0 +1,744 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
 * Define kernel meta.
 * B --- batch
 * N --- num_heads
 * S --- sequence length
 * H --- head size
 */
typedef enum
{
LAYOUT_NONE,
LAYOUT_BNHS,
LAYOUT_BNH1,
LAYOUT_BSNH,
LAYOUT_BNSH,
} _internal_rope_layout_e;
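/* Background (not stated in this file): these kernels apply rotary position
 * embedding. Elements along the rotated axis are handled in pairs (a, b)
 * with a position-dependent angle t, using the precomputed cos/sin inputs:
 *   a' = a * cos(t) - b * sin(t)
 *   b' = a * sin(t) + b * cos(t)
 * The interleaved flag appears to select the pairing: adjacent elements
 * (x[2i], x[2i+1]) when set, versus elements half_head_size apart
 * (x[i], x[i + H/2]) when clear - matching the even/odd selectors and the
 * half_head_size parameter in the initializer below. */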
// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \
((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28))
#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \
"rope_0" }
#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \
"rope_1" }
#define PACK_KERNEL_BSNH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \
"rope_2" }
#define PACK_KERNEL_BNSH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \
"rope_3" }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BSNH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BNSH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE),
static const _kernel_map_type _rope_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( BF16, BF16, BF16)
PACK_KERNEL_MAP( F16, F16, F16 )
PACK_KERNEL_MAP( I16, I16, I16 )
PACK_KERNEL_MAP( I16, F16, I16 )
PACK_KERNEL_MAP( I16, I16, I8 )
PACK_KERNEL_MAP( I16, F16, I8 )
PACK_KERNEL_MAP( I16, I16, U8 )
PACK_KERNEL_MAP( I16, F16, U8 )
PACK_KERNEL_MAP( U16, U16, U16 )
PACK_KERNEL_MAP( U16, F16, U16 )
PACK_KERNEL_MAP( I8, I8, I8 )
PACK_KERNEL_MAP( I8, F16, I8 )
PACK_KERNEL_MAP( U8, U8, U8 )
PACK_KERNEL_MAP( U8, F16, U8 )
};
/*
* Kernel params
*/
static vx_param_description_t _rope_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_rope_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* out_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in0_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in1_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in2_attr = NULL;
vsi_size_array_t* in_shape = NULL;
vsi_nn_kernel_dtype_e in0_dtype = F16;
vsi_nn_kernel_dtype_e in1_dtype = F16;
vsi_nn_kernel_dtype_e in2_dtype = F16;
vsi_nn_kernel_dtype_e out_dtype = F16;
float in0_scale = 1.0f;
float in1_scale = 1.0f;
float in2_scale = 1.0f;
float output_scale = 1.0f;
float output_zp = 0;
int32_t in0_zp = 0;
int32_t cos_zp = 0;
int32_t sin_zp = 0;
int32_t p = 0;
int32_t axis = 0;
int32_t interleaved = 0;
int32_t half_head_size = 1;
vsi_size_t shape[3] = {1};
uint32_t pack_key = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param);
VSI_UNREFERENCED(param_size);
// Add initializer
in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final);
in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final);
in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final);
out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]);
CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p);
CHECK_STATUS_FAIL_GOTO(status, final);
axis = p & 0xFFFF;
interleaved = (p >> 16) & 0xFFFF;
in_shape = in0_attr->shape;
in0_dtype = in0_attr->dtype;
in1_dtype = in1_attr->dtype;
in2_dtype = in2_attr->dtype;
out_dtype = out_attr->dtype;
in0_scale = in0_attr->scale;
in1_scale = in1_attr->scale;
in2_scale = in2_attr->scale;
in0_zp = -in0_attr->zero_point;
cos_zp = -in1_attr->zero_point;
sin_zp = -in2_attr->zero_point;
output_scale = out_attr->scale;
output_zp = (float)out_attr->zero_point;
half_head_size = (int32_t)(in_shape->data[axis] / 2);
shape[0] = in_shape->data[0];
shape[1] = in_shape->data[1];
shape[2] = in_shape->data[2];
shape[axis] = half_head_size;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((shape[0] + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = shape[1];
gpu_param.global_size[2] = shape[2];
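/* Worked example of the sizing above (illustrative numbers): each work-item
 * covers global_scale[0] = 8 lanes of the halved axis, so for
 * shape[0] = half_head_size = 33:
 *   (33 + 8 - 1) / 8 = 5 items, and gpu_align_p2(5, 4) rounds up to 8,
 * giving global_size[0] = 8; dimensions 1 and 2 keep one element per item. */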
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24))
pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype);
switch (pack_key)
{
case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { {
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { {
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = { {
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractOddData_2x8.data[1] = 0x10101010;
uniExtractOddData_2x8.data[2] = 0x03030101;
uniExtractOddData_2x8.data[3] = 0x07070505;
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I16, I16, I16, I16):
case _PACK_SELECT_KEY(I16, F16, F16, I16):
case _PACK_SELECT_KEY(I16, I16, I16, I8):
case _PACK_SELECT_KEY(I16, F16, F16, I8):
case _PACK_SELECT_KEY(I16, I16, I16, U8):
case _PACK_SELECT_KEY(I16, F16, F16, U8):
case _PACK_SELECT_KEY(F16, F16, F16, F16):
{
float scale0 = in0_scale * in1_scale / output_scale;
float scale1 = in0_scale* in2_scale / output_scale;
gpu_dp_inst_t uniExtractHalf8_2x8 = { {
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInteger_2x8 = { {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniATimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniATimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddTimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddTimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractHalf8_2x8.data[1] = 0x10101010;
uniExtractHalf8_2x8.data[2] = 0x02020000;
uniExtractHalf8_2x8.data[3] = 0x06060404;
uniExtractInteger_2x8.data[1] = 0x10101010;
uniExtractInteger_2x8.data[2] = 0x01010000;
uniExtractInteger_2x8.data[3] = 0x03030202;
status = vsi_nn_kernel_gpu_add_param(node,
"uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"uniATimesB_0_4x4", &uniATimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniATimesB_1_4x4", &uniATimesB_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
}
status |= vsi_nn_kernel_gpu_add_param(node,
"scale0", &scale0);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale1", &scale1);
status |= vsi_nn_kernel_gpu_add_param(node,
"output_zp", &output_zp);
if (out_dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I8, I8, I8, I8):
case _PACK_SELECT_KEY(U8, U8, U8, U8):
case _PACK_SELECT_KEY(U16, U16, U16, U16):
case _PACK_SELECT_KEY(I8, F16, F16, I8):
case _PACK_SELECT_KEY(U8, F16, F16, U8):
case _PACK_SELECT_KEY(U16, F16, F16, U16):
{
float scale0 = in0_scale * in1_scale / output_scale;
float scale1 = in0_scale* in2_scale / output_scale;
gpu_dp_inst_t uniExtractInteger_2x8 = { {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAMinusZp_0_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAMinusZp_1_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenMinusZp_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddMinusZp_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractInteger_2x8.data[1] = 0x10101010;
uniExtractInteger_2x8.data[2] = 0x01010000;
uniExtractInteger_2x8.data[3] = 0x03030202;
status = vsi_nn_kernel_gpu_add_param(node,
"uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
}
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale0", &scale0);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale1", &scale1);
status |= vsi_nn_kernel_gpu_add_param(node,
"output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"in0_zp", &in0_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"cos_zp", &cos_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"sin_zp", &sin_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
default:
break;
}
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr);
if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr);
if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr);
if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr);
return status;
} /* _rope_initializer() */
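/* The single I32 scalar shared with _setup() packs both launch options:
 * axis in the low 16 bits, the interleaved flag in the high 16 bits.
 * Round trip (see the param assembly in _setup() below):
 *   p = (interleaved << 16) | axis;        e.g. interleaved=1, axis=0 -> 0x00010000
 *   axis        = p & 0xFFFF;
 *   interleaved = (p >> 16) & 0xFFFF;
 * Packing both into one scalar keeps _rope_kernel_param_def at five entries. */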
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
int32_t interleaved,
_internal_rope_layout_e *layout
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]);
int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]);
const _kernel_map_type * kernel_map = _rope_kernel_map;
size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
vx_param_description_t * param_def = _rope_kernel_param_def;
vx_kernel_initialize_f initializer = _rope_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
/*only support symmetric int16*/
if ( ( (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I16) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) ||
(in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) ||
(in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) &&
(in0_zp != 0 || in1_zp != 0 || in2_zp != 0))
{
return VSI_FAILURE;
}
if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] &&
in1_dtype == in2_dtype)
{
if (inputs[0]->attr.size[0] == 1)
{
*layout = LAYOUT_BNH1;
}
else
{
*layout = LAYOUT_BNHS;
}
}
else if (axis == 0 && in1_dtype == in2_dtype)
{
if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] &&
inputs[1]->attr.size[1] == 1)
{
*layout = LAYOUT_BSNH;
}
else
{
*layout = LAYOUT_BNSH;
}
}
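/* Summary of the shape-driven inference above, in the B/N/S/H naming from the
 * comment at the top of this file (a reading, not extra logic; cos and sin
 * must share a dtype in every case):
 *   axis == 1, in0 and cos agree on size[0], size[0] == 1  -> LAYOUT_BNH1
 *   axis == 1, in0 and cos agree on size[0], size[0] >  1  -> LAYOUT_BNHS
 *   axis == 0, sizes agree on dim 2 and cos size[1] == 1   -> LAYOUT_BSNH
 *   axis == 0, otherwise                                   -> LAYOUT_BNSH
 * so callers never pass a layout; it falls out of the tensor shapes. */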
key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
int32_t i = 0;
int32_t interleaved = 0;
int32_t param = 0;
vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_tensor_t* rs_tensors[4] = { NULL };
vsi_nn_tensor_t* reshape_tensors[4] = { NULL };
_internal_rope_layout_e layout = LAYOUT_NONE;
VSI_UNREFERENCED(params);
axis = vsi_nn_kernel_param_get_int32(params, "axis");
interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout );
if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH)
{
memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
if (outputs[0]->attr.size[0] == 1)
{
for (i = 1; i < 3; i++)
{
shape[0][i - 1] = shape[0][i];
shape[1][i - 1] = shape[1][i];
shape[2][i - 1] = shape[2][i];
}
shape[0][2] = 1;
shape[1][2] = 1;
shape[2][2] = 1;
}
else
{
int32_t j = 0;
for (i = 0; i < 3; i++)
{
if (shape[1][i] != 1)
{
shape[1][j] = shape[1][i];
j ++;
}
}
for (; j < 3; j++)
{
shape[1][j] = 1;
}
}
rs_tensors[0] = vsi_nn_reshape_tensor(graph,
inputs[0], shape[0], inputs[0]->attr.dim_num);
rs_tensors[1] = vsi_nn_reshape_tensor(graph,
inputs[1], shape[1], inputs[1]->attr.dim_num);
rs_tensors[2] = vsi_nn_reshape_tensor(graph,
inputs[2], shape[1], inputs[2]->attr.dim_num);
rs_tensors[3] = vsi_nn_reshape_tensor(graph,
outputs[0], shape[2], outputs[0]->attr.dim_num);
if (outputs[0]->attr.size[0] == 1 && axis > 0)
{
axis--;
}
reshape_tensors[0] = rs_tensors[0];
reshape_tensors[1] = rs_tensors[1];
reshape_tensors[2] = rs_tensors[2];
reshape_tensors[3] = rs_tensors[3];
}
else
{
reshape_tensors[0] = inputs[0];
reshape_tensors[1] = inputs[1];
reshape_tensors[2] = inputs[2];
reshape_tensors[3] = outputs[0];
}
param = (interleaved << 16) | axis;
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[3], output_num );
/* Pass parameters to node. */
node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &param);
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]);
}
}
for (i = 0; i < 4; i++)
{
vsi_safe_release_tensor(rs_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( rope, _setup )


@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] =
{
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};
static const _kernel_map_type scatter_nd_update_special_update_map[] =
{
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};
static const _kernel_map_type scatter_nd_update_special_copy_map[] =
{
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
};
/*
@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
{
case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( U16, U16 ):
{
uint16_t M0 = 0;
int32_t postShift0 = 0;
@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16 ):
break;
default:
break;
}
@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
{
case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( U16, U16 ):
{
uint16_t M1 = 0;
int32_t postShift1 = 0;
@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16 ):
break;
default:
break;
}
@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special
status |= VSI_FAILURE;
}
if (input0_dtype == F16)
{
input0_dtype = U16;
}
if (input2_dtype == F16)
{
input2_dtype = U16;
}
if (output_dtype == F16)
{
output_dtype = U16;
}
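/* The special-path kernels only move 16-bit payloads for F16 (its initializer cases are
   no-ops above), so F16 reuses the U16 kernel binaries: remap the dtypes before hashing
   the kernel key. */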
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ )

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
return NULL;
}

View File

@ -548,16 +548,16 @@ static vsi_status _gpu_register
vsi_status status;
vx_kernel_description_t* info;
vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE;
info = &(kernel->info);
@ -579,21 +579,21 @@ static vsi_status _gpu_register
return status;
}
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
{
// set the default evis version to 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va );
options->config.use_40bits_va );
}
}
else
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va );
options->config.evis.ver, options->config.use_40bits_va );
}
// Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data )
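For reference, on an EVIS 2.0 target with 40-bit VA enabled the assembled option string reads:

    -cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=1

Any per-kernel build options are appended to this prefix by the packing step that follows.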
@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext
vsi_status status;
vx_kernel_description_t* info;
vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE;
info = &(kernel->info);
@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext
return status;
}
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
{
// set the default evis version to 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va );
options->config.use_40bits_va );
}
}
else
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va );
options->config.evis.ver, options->config.use_40bits_va );
}
// Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data )
@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
}
/* Skip evis if not support */
if( type == VSI_NN_KERNEL_TYPE_EVIS
&& graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE )
&& ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE )
{
continue;
}
@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
if ( graph->ctx->config.subGroupSize == 0 )
if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 )
{
return FALSE;
}

View File

@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
#endif
#if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm)
#endif
#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(exp)
#endif
@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif
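/* group_norm and instance_norm selectors are registered unconditionally; their OpenVX
   backends guard themselves with VX_GROUP_NORMALIZATION_VX_SUPPORT /
   VX_INSTANCE_NORMALIZATION_VX_SUPPORT (see the new kernel files below). */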
REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm)
REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm)
__END_DECLS

View File

@ -0,0 +1,89 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if VX_GROUP_NORMALIZATION_VX_SUPPORT
#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm)
{
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32(params, "eps");
int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num");
vx_tensor inputs_tensor[3] = { NULL };
vx_tensor output_tensor = NULL;
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(kernel);
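/* Dispatch to the OpenVX built-in only on targets with the fixed-function or
   stream-processor path; returning NULL otherwise lets the kernel selector fall back
   to shader implementations (an inference from the guard below). */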
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
{
node = vxGroupNormalizationLayer(
graph->g,
eps,
group_num,
inputs_tensor,
(vx_uint32)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* group_norm() */
#endif

View File

@ -0,0 +1,87 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT
#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm)
{
vsi_nn_kernel_node_t node = NULL;
float eps = vsi_nn_kernel_param_get_float32(params, "eps");
vx_tensor inputs_tensor[3] = { NULL };
vx_tensor output_tensor = NULL;
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(kernel);
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
{
node = vxInstanceNormalizationLayer(
graph->g,
eps,
inputs_tensor,
(vx_uint32)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* instance_norm() */
#endif

View File

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT)
#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
@ -71,14 +71,20 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm )
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
node = vxLayerNormalizationLayer(
graph->g,
eps,
axis,
inputs_tensor,
(uint32_t)input_num,
output_tensor
#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
#endif
{
node = vxLayerNormalizationLayer(
graph->g,
eps,
axis,
inputs_tensor,
(uint32_t)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* layer_norm() */

View File

@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
vsi_nn_tensor_attr_t attr;
memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
attr.vtl = FALSE;
attr.vtl = TRUE;
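/* vtl = TRUE keeps the dtype-conversion tensor virtual, so the runtime may fold it into
   the graph instead of materializing a buffer (an inference from the virtual-tensor flag). */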
attr.is_const = FALSE;
convert_tensor = vsi_nn_CreateTensor(graph, &attr);

View File

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#if (VX_RELATIONAL_OPS_VX_SUPPORT)
#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num);
node = vxRelationalLayer(graph->g,
operation,
inputs_tensor,
(uint32_t)input_num,
outputs[0]->t
);
#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
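/* Without the EXT API the built-in relational layer cannot broadcast; bail out so the
   selector picks a shader kernel for broadcast shapes (an inference from this guard). */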
if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0]))
{
return NULL;
}
#endif
#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
if (graph->ctx->config.support_stream_processor)
#endif
{
node = vxRelationalLayer(
graph->g,
operation,
inputs_tensor,
(uint32_t)input_num,
outputs[0]->t
);
}
return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */

View File

@ -23,6 +23,7 @@
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish )
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(input_num);
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type");

View File

@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2(
}
}
#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis2( \
#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0.0f; \
\
if(exclusive && rev) \
{ \
coord_out.z = channel - 1; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.z = channel - 1; coord.z > 0; coord.z--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.z--; \
cnt += 1.0f; \
sum += data; \
@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(exclusive) \
{ \
coord_out.z = 0; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.z = 0; coord.z < channel - 1; coord.z++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.z++; \
cnt += 1.0f; \
sum += data; \
@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(rev) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
else \
{ \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
}
CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)
CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
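// For reference, the I32 instantiation above pastes to a kernel named cumsum_I32toI32_axis2
// that reads via read_imagei, accumulates in an int4 sum, converts with convert_int_sat_rte,
// and stores via write_imagei; the two U8 variants differ only in these substituted tokens.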
__kernel void cumsum_F32toF32_axis1(
__read_only image2d_array_t input,
@ -233,10 +232,10 @@ __kernel void cumsum_F32toF32_axis1(
}
}
#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0; \
\
if(exclusive && rev) \
{ \
coord_out.y = height - 1; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
\
for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
coord_out.y--; \
sum += data; \
@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(exclusive) \
{ \
coord_out.y = 0; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
coord_out.y++; \
sum += data; \
@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
else \
{ \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
}
CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)
CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
__kernel void cumsum_F32toF32_axis0(
__read_only image2d_array_t input,
@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0(
}
}
#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis0( \
#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0; \
\
if(exclusive && rev) \
{ \
coord_out.x = width - 1; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.x = width - 1; coord.x > 0; coord.x--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.x--; \
cnt += 1.0f; \
sum += data; \
@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(exclusive) \
@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \
image_write(output, coord_out, dst); \
for(coord.x = 0; coord.x < width - 1; coord.x++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.x++; \
cnt += 1.0f; \
sum += data; \
@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(rev) \
{ \
for(coord.x = width - 1; coord.x >= 0; coord.x--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
else \
{ \
for(coord.x = 0; coord.x < width; coord.x++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
}
CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)
CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

View File

@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D(
}
}
__kernel void cumsum_U8toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0;
if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
write_imageui(output, coord.zw, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}
__kernel void cumsum_F32toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0;
if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
write_imageui(output, coord.zw, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis1_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int chn, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
src_type sum = (src_type)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0; \
\
if(exclusive && rev) \
{ \
coord.w = height - 1; \
image_write(output, coord.zw, dst); \
for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
coord.w--; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(exclusive) \
{ \
image_write(output, coord.zw, dst); \
for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
coord.w++; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
else \
{ \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
}
CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
__kernel void cumsum_F32toF32_axis0_2D(
__read_only image2d_t input,
@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D(
}
}
__kernel void cumsum_U8toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, dst);
for(; coord.x > 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}
__kernel void cumsum_F32toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, dst);
for(; coord.x > 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis0_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int chn, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
src_type sum = (src_type)(0); \
dst_type dst = (dst_type)(0); \
\
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0.0f; \
\
if(exclusive && rev) \
{ \
coord.x = width - 1; \
coord.z = coord.x; \
image_write(output, coord.zw, dst); \
for(; coord.x > 0; coord.x--) \
{ \
src_type data = image_read(input, coord.xy); \
coord.z--; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(exclusive) \
{ \
coord.z = 0; \
image_write(output, coord.zw, dst); \
for(coord.x = 0; coord.x < width - 1; coord.x++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
coord.z++; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(rev) \
{ \
for(coord.x = width - 1; coord.x >= 0; coord.x--) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
else \
{ \
for(coord.x = 0; coord.x < width; coord.x++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
}
CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

View File

@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
coord.z ++;
} while (coord.z < depth);
}
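/* BF16 variant (an assumption from the write path below): on_value / off_value arrive as
   raw bfloat16 bit patterns widened to uint, so they can be written unchanged through the
   integer image view. */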
__kernel void one_hot_I32toBF16
(
__read_only image2d_t input,
__write_only image2d_array_t output,
int depth,
uint on_value,
uint off_value,
float inputScale,
float inputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 src = read_imagei(input, coord.xy);
int val = convert_int(convert_float(src.x) * inputScale - inputTail);
do
{
uint4 dst;
dst.x = val == coord.z ? on_value : off_value;
write_imageui(output, coord.xzyw, dst.xxxx);
coord.z ++;
} while (coord.z < depth);
}

View File

@ -0,0 +1,373 @@
__kernel void rope_F32_F32toF32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos, sin;
READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_F32_F32toF32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos, sin;
READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_F32_F32toF32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = read_imagef(cos_cache, coord);
float4 sin = read_imagef(sin_cache, coord);
coord.z = coord.z * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 _cos, _sin;
float4 cos, sin;
READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 _cos, _sin;
float4 cos, sin;
READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = convert_float4(read_imagei(cos_cache, coord));
float4 sin = convert_float4(read_imagei(sin_cache, coord));
coord.z = coord.z * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = cos - cos_zp;
sin = sin - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
uint4 _cos, _sin;
float4 cos, sin;
READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imageui(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
uint4 _cos, _sin;
float4 cos, sin;
READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imageui(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = convert_float4(read_imageui(cos_cache, coord));
float4 sin = convert_float4(read_imageui(sin_cache, coord));
coord.z = coord.z * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = cos - cos_zp;
sin = sin - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imageui(output, coord_out, dst1);
}
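// For reference, a minimal sketch (hypothetical, not part of the source) of the rotation
// all rope_* kernels above implement, with the rotated pair split at half_head_size:
void rope_rotate_ref(const float* x, const float* c, const float* s,
                     float* out, int half)
{
    for (int i = 0; i < half; ++i)
    {
        out[i]        = x[i] * c[i] - x[i + half] * s[i]; /* dst0 = src0*cos - src1*sin */
        out[i + half] = x[i] * s[i] + x[i + half] * c[i]; /* dst1 = src0*sin + src1*cos */
    }
}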

View File

@ -0,0 +1,307 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int top;
_viv_uniform int left;
_viv_uniform float out_scale_r;
_viv_uniform float out_scale_g;
_viv_uniform float out_scale_b;
_viv_uniform float out_zp_r;
_viv_uniform float out_zp_g;
_viv_uniform float out_zp_b;
_viv_uniform float pad_v_r;
_viv_uniform float pad_v_g;
_viv_uniform float pad_v_b;
_viv_uniform float scale_w;
_viv_uniform float scale_h;
_viv_uniform int resize_max_w;
_viv_uniform int resize_max_h;
_viv_uniform int out_height;
_viv_uniform int r_order;
_viv_uniform int b_order;
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
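/* r_order / b_order hold the output row offsets of the R and B planes (G always sits at
   out_height); swapping them implements reverse_channel for the planar RGB/BGR layout
   written below (an inference from the coord.y offsets). */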
__kernel void custom_letterbox_U8toU8
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
uint4 dst = (uint4)(0,0,0,0);
vxc_uchar8 result;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
dst.x = convert_uint(pad_v_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(pad_v_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(pad_v_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_uchar8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
dst.x = convert_uint(out.x * out_scale_r + out_zp_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(out.y * out_scale_g + out_zp_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(out.z * out_scale_b + out_zp_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
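/* The sampling above is plain bilinear interpolation: each row is first lerped in x
   (right * x_lerp + left), then the two rows are lerped in y. The I8 and F16 variants
   below differ only in the final conversion and write path. */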
__kernel void custom_letterbox_U8toI8
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
int4 dst = (int4)(0,0,0,0);
vxc_char8 result;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
dst.x = convert_int(pad_v_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(pad_v_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(pad_v_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_char8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
dst.x = convert_int(out.x * out_scale_r + out_zp_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(out.y * out_scale_g + out_zp_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(out.z * out_scale_b + out_zp_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_letterbox_U8toF16
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
half4 tmp;
vxc_half8 dst_temp;
vxc_ushort8 dst;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);
_viv_asm(CONV, tmp, pad);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + r_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
tmp.x = tmp.y;
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + out_height;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
tmp.x = tmp.z;
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + b_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_uchar8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
float4 out_temp = (float4)(0,0,0,0);
out_temp.x = out.x * out_scale_r + out_zp_r;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + r_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
out_temp.x = out.y * out_scale_g + out_zp_g;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + out_height;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
out_temp.x = out.z * out_scale_b + out_zp_b;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + b_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

View File

@ -10,7 +10,12 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;
_viv_uniform VXC_512Bits uniExtract8Bin_2x8;
_viv_uniform int sf_size;
_viv_uniform float srcScale;
_viv_uniform float srcZP;
_viv_uniform float dstScale;
_viv_uniform float dstZP;
#define F_MAX(a,b) ((a)>(b)?(a):(b))
__kernel void Softmax2VXC
(
@ -19,35 +24,37 @@ __kernel void Softmax2VXC
int axis
)
{
int4 coord_in = (int4)(0,0,0,0);
float fMax = 0.0;
int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
float fMax = 0;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, val_h, val, 16);
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fMax = F_MAX(fMax, fval);
}
float fProbSum = 0.0f;
vxc_short8 dst;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, val_h, val, 16);
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
float fOut = (float)exp(fval - fMax);
fProbSum += fOut;
half hVal;
_viv_asm(CONV,hVal,fOut);
_viv_asm(COPY,dst,hVal, 4);
_viv_asm(CONV, hVal, fOut);
_viv_asm(COPY, dst, hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
@ -56,15 +63,68 @@ __kernel void Softmax2VXC
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
_viv_asm(COPY, val_h, val, 16);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
float fOut =fval/fProbSum;
float fOut = fval / fProbSum;
half hVal;
_viv_asm(CONV,hVal,fOut);
_viv_asm(CONV, hVal, fOut);
_viv_asm(COPY, dst, hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
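// Softmax2VXC evaluates softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
// in three passes along the sf_size axis: pass 1 finds the maximum, pass 2
// stores the unnormalized exp(x_i - max) to the output while accumulating
// fProbSum, and pass 3 reads those values back and divides by the sum in place.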
__kernel void Softmax2VXC_u8
(
image2d_array_t input,
image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
float fMax = -3.4e38f;
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
fMax = F_MAX(fMax, fval);
}
float fProbSum = 0.0f;
vxc_uchar8 dst;
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
float fOut = (float)exp(fval - fMax);
fProbSum += fOut;
}
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
float fOut = exp(fval - fMax) / fProbSum;
fOut = fOut * dstScale + dstZP;
short dst0;
_viv_asm(CONV, dst0, fOut);
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
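// The U8 variant dequantizes each element as x = (q - srcZP) * srcScale.
// Pass 1 finds max(x), pass 2 accumulates fProbSum = sum(exp(x - max)), and
// pass 3 recomputes exp(x - max) / fProbSum and requantizes the probability
// as q_out = p * dstScale + dstZP, saturated into u8 by the final VXC_DP2x8.
// E.g. with hypothetical srcScale = 0.1 and srcZP = 128, an input byte of
// 148 dequantizes to 2.0 before the exp/normalize steps.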

View File

@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \
#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
int lidx = get_local_id(0); \
int gidz = get_global_id(1); \
int4 coord = (int4)(gidx, 0, gidz, 0); \
vxc_short8 src0; \
load_type src; \
src_type in_h; \
float4 sumsqr; \
float4 tmpSumSqr = (float4)(0); \
@ -43,9 +43,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
{ \
for(coord.y = 0; coord.y < height;) \
{ \
VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.y++; \
_viv_asm(COPY, in_h, src0, 16); \
_viv_asm(COPY, in_h, src, 16); \
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
tmpSumSqr += sumsqr; \
} \
@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
write_imagef(output, coord_out, data); \
} \
}
GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)
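// group_norm_sums accumulates the per-group statistics sum(x) and sum(x^2)
// (via the uniSum_X_X2_8x2 dot-product table) that the group_norm kernels
// below fold into the per-element scale (alpha) and bias. load_type is the
// raw 8x16-bit container read from the image; src_type is how those bits are
// interpreted: vxc_half8 for F16, vxc_short8/vxc_ushort8 for I16/U16.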
#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \
#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
int lidx = get_local_id(0); \
\
int2 coord = (int2)(gidx, get_global_id(1)); \
vxc_short8 src0; \
load_type src; \
src_type in_h; \
float4 sumsqr = (float4)(0); \
\
@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
\
if(gidx < width) \
{ \
VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in_h, src0, 16); \
VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in_h, src, 16); \
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \
sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \
@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
write_imagef(output, coord_out, data); \
} \
}
GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)
#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, outval, dst, 16); \
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
float4 norm; \
norm = alpha * tmpData0 + bias_val; \
_viv_asm(CONV, tmpVal0, norm); \
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, outval, dst, 16); \
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
\
float4 norm; \
norm = alpha * tmpData0 + bias_val; \
_viv_asm(CONV, tmpVal0, norm); \
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
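// In these normalization kernels, CONV_RTE converts the normalized float4 to
// the destination integer type with round-to-nearest-even, and the VXC_DP2x8
// extract runs with the clamp flag set (last VXC_MODIFIER argument = 1) so
// the packed result saturates to the destination range.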
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
float4 norm; \
norm = alpha * tmpData0 + bias_val; \
_viv_asm(CONV, tmpVal0, norm); \
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, outval, dst, 16); \
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8,
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)

View File

@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;
_viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;
_viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;
_viv_uniform VXC_512Bits uniExtact8Bin_2x8;
_viv_uniform int inputZP0;
_viv_uniform int inputZP1;
_viv_uniform float input_scale0;
_viv_uniform float input_scale1;
_viv_uniform float outputZP;
#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name0##to##name1( \
_viv_uniform int input0_zp;
_viv_uniform int input1_zp;
_viv_uniform float input0_scale;
_viv_uniform float input1_scale;
_viv_uniform float output_zp;
#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
vxc_float4 vecA, vecB, vecC, vecD;\
float4 vecA, vecB, vecC, vecD;\
input_type0 srcA;\
copy_type0 src0;\
vxc_short8 srcB;\
vxc_half8 src1;\
input_type0 input_ZP;\
input_type0 zp;\
VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src0, srcA, 16); \
VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src1, srcB, 16); \
\
_viv_asm(COPY, input_ZP, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
_viv_asm(COPY, zp, input0_zp, 4);\
VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
uniDataSubZPtoFp32Part0_4x4); \
VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
uniDataSubZPtoFp32Part1_4x4);\
VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
float4 maxData0 = vecA > 0 ? vecA : 0.0; \
float4 maxData1 = vecB > 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\
@ -164,49 +164,49 @@ _viv_uniform float outputZP;
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
// name, input_type0, copy_type0, output_type, convert_type, copy_type
PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name0##to##name1##_2D( \
#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
vxc_float4 vecA, vecB, vecC, vecD;\
float4 vecA, vecB, vecC, vecD;\
input_type0 srcA;\
copy_type0 src0;\
vxc_short8 srcB;\
vxc_half8 src1;\
input_type0 input_ZP;\
input_type0 zp;\
VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src0, srcA, 16); \
VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src1, srcB, 16); \
\
_viv_asm(COPY, input_ZP, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp, input0_zp, 4);\
VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
float4 maxData0 = vecA > 0 ? vecA : 0.0; \
float4 maxData1 = vecB > 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\
@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
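// Every PReLU variant implements out = max(x, 0) + slope * min(x, 0): the
// positive part passes through and the negative part is scaled by the
// per-element slope read from input1. Operands are dequantized with their
// zero point and scale first, and output_zp re-centers the result before the
// saturating pack (the output scale is presumably folded into
// input0_scale/input1_scale on the host).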
#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \
__kernel void prelu_U8U8to##name##_2D( \
#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
vxc_float4 vecA, vecB, vecC, vecD;\
vxc_uchar16 src0;\
vxc_uchar16 src1;\
vxc_uchar16 input_ZP0;\
vxc_uchar16 input_ZP1;\
float4 vecA, vecB, vecC, vecD;\
src0_type src0;\
src1_type src1;\
short zp0;\
short zp1;\
VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
\
_viv_asm(COPY, input_ZP0, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, input_ZP1, inputZP1, 4);\
VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp0, input0_zp, 2);\
VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp1, input1_zp, 2);\
VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vecC = vecC * input_scale1;\
vecD = vecD * input_scale1;\
vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
vecC = vecC * input1_scale;\
vecD = vecD * input1_scale;\
float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\
@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)
PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)
PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
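// A minimal per-element host reference for the U8U8toU8 path, assuming
// hypothetical scales and zero points:
//   float x = (q0 - input0_zp) * input0_scale;
//   float s = (q1 - input1_zp) * input1_scale;
//   float y = (x >= 0.0f ? x : s * x) + output_zp;
//   int yi = (int)roundf(y);
//   unsigned char q_out = yi < 0 ? 0 : (yi > 255 ? 255 : yi);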

View File

@ -0,0 +1,181 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP; // bits [0:15]: multiplier, bits [32:63]: output zp
_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;
_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;
_viv_uniform int out_height;
__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 1) >> 2;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, tmp, result;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 dst0;
while (coord_out.y < out_height)
{
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}
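// 2x upsample with half-pixel centers: each output row maps back to
// in_y = (2 * out_y - 1) / 4 (the >> 2 above), so the fractional weights
// alternate between 1/4 and 3/4; the host presumably bakes those weights
// into the uniResize2xUp_*_4x8 tables. Each loop iteration consumes two new
// input rows and emits four output rows, swapping the in0/in1 operand order
// to mirror the blend.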
_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;
__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 3) >> 3;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, dst0, dst1;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}
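// 4x upsample: in_y = (2 * out_y - 3) / 8, so the fractional weights cycle
// through 1/8, 3/8, 5/8 and 7/8; each loop iteration advances two input rows
// and produces eight output rows, reusing the l0x/l1x tables with in0/in1
// swapped for the mirrored weights.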

View File

@ -0,0 +1,102 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP; // bits [0:15]: multiplier, bits [32:63]: output zp
_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;
__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;
coord_in.y = coord_out.y == 0 ? -1 : coord_in.y;
vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 data;
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
}
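// 3x upsample: in = (2 * out - 1) / 6, evaluated in short arithmetic so the
// truncating division matches on both axes; the row weights cycle through
// 0, 1/3 and 2/3, so every third output row (the l00/l01 tables) only
// interpolates horizontally while the l10..l13 tables blend two input rows.
// Stores use modifier range 0..14, i.e. 15 output pixels per write (five
// input pixels upscaled 3x).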

View File

@ -0,0 +1,167 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP; // bits [0:15]: multiplier, bits [32:63]: output zp
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;
__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 7) >> 4;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
}
}
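// 8x upsample: in_y = (2 * out_y - 7) / 16, giving fractional weights at the
// odd sixteenths 1/16 .. 15/16; one loop iteration advances two input rows
// and stores sixteen output rows (four dst0..dst3 batches), again mirroring
// the weight tables by swapping in0/in1.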

View File

@ -0,0 +1,303 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnhs \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
int4 coord_out = coord_in; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.y += half_head_size; \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale1 + data4 * scale0 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.y += half_head_size; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
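// Rotary position embedding on the bnhs layout: the head dimension is split
// at half_head_size into (x1, x2) and the kernels write the rotation
//   out1 = x1 * cos - x2 * sin
//   out2 = x1 * sin + x2 * cos
// where scale0/scale1 presumably fold the quantization scales of the input
// with the cos and sin caches, and output_zp re-centers the requantized
// result. The F16 variant below applies the same rotation in half precision.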
__kernel void rope_F16_F16toF16_bnhs
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord_in;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.y += half_head_size;
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 - data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 + data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.y += half_head_size;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
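// Asymmetric-quantized bnhs variant: uniAMinusZp_0/1_4x4 subtract the per-tensor
// zero points (in0_zp, cos_zp, sin_zp) to dequantize before rotating; the scales
// are assumed to be folded into scale0/scale1 as in the symmetric kernels above.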
#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnhs \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
int4 coord_out = coord_in; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.y += half_head_size; \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.y += half_head_size; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
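// BF16 bnhs variant: each bf16 value is widened to f32 by placing it in the high
// 16 bits of a 32-bit word (uniConvBF16toF32_Part0/1_2x8), the rotation runs in
// float, and results are truncated back to bf16 by keeping the high halves
// (uniExtractOddData_2x8).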
__kernel void rope_BF16_BF16toBF16_bnhs
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord_in;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y += half_head_size;
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 data0 = src0 * cos0 - src2 * sin0;
float4 data1 = src1 * cos1 - src3 * sin1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
data0 = src0 * sin0 + src2 * cos0;
data1 = src1 * sin1 + src3 * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.y += half_head_size;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
@ -0,0 +1,245 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
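// bnh1 layout: each dispatch works on a single 2D slice, so plain
// VXC_ReadImage/VXC_WriteImage replace the 3D image loads. The rotation pair is
// again split at half_head_size, this time along x, and the second half is
// written back at coord.x + half_head_size.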
#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnh1 \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord.x += half_head_size; \
VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
dst_type dst; \
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale1 + data4 * scale0 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
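// F16 bnh1 variant: the same rotation without quantization scales or zero point.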
__kernel void rope_F16_F16toF16_bnh1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord.x += half_head_size;
VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 - data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 + data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
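// Asymmetric-quantized bnh1 variant: zero points are subtracted via the
// uniAMinusZp_* uniforms before rotating, mirroring the asymmetric bnhs kernels.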
#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnh1 \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord.x += half_head_size; \
VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
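// BF16 bnh1 variant: widen to f32, rotate, then truncate back to bf16 exactly as
// in the bnhs kernel.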
__kernel void rope_BF16_BF16toBF16_bnh1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += half_head_size;
VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 data0 = src0 * cos0 - src2 * sin0;
float4 data1 = src1 * cos1 - src3 * sin1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
data0 = src0 * sin0 + src2 * cos0;
data1 = src1 * sin1 + src3 * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
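// bsnh layout keeps each rotation pair interleaved (even/odd neighbours), so
// uniAEvenTimesB_* and uniAOddTimesB_* select the even and odd lanes:
// out_even = x_even*cos - x_odd*sin and out_odd = x_even*sin + x_odd*cos.
// coord_in.x is doubled because every work-item covers 16 interleaved elements,
// loaded as two 8-lane vectors at x and x+8.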
#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bsnh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
\
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
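// F16 bsnh variant: the same interleaved rotation without quantization.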
__kernel void rope_F16_F16toF16_bsnh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
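// Asymmetric-quantized bsnh variant: uniAEvenMinusZp_4x4/uniAOddMinusZp_4x4
// de-interleave the input while subtracting in0_zp; cos and sin are dequantized
// with the shared uniAMinusZp_* tables.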
#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bsnh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
\
VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
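// BF16 bsnh variant: widen to f32, split the even/odd lanes with swizzles,
// rotate, and truncate back to bf16.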
__kernel void rope_BF16_BF16toBF16_bsnh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 even = (float4)(src0.xz, src1.xz);
float4 odd = (float4)(src0.yw, src1.yw);
float4 data0 = even * cos0 - odd * sin0;
float4 data1 = even * sin0 + odd * cos0;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
even = (float4)(src2.xz, src3.xz);
odd = (float4)(src2.yw, src3.yw);
data0 = even * cos1 - odd * sin1;
data1 = even * sin1 + odd * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
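// The bnsh kernels repeat the interleaved rotation of the bsnh family; the main
// difference is that cos/sin are indexed by coord_in.xy instead of coord_in.xw.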
#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnsh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
\
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
__kernel void rope_F16_F16toF16_bnsh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnsh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
\
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnsh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 even = (float4)(src0.xz, src1.xz);
float4 odd = (float4)(src0.yw, src1.yw);
float4 data0 = even * cos0 - odd * sin0;
float4 data1 = even * sin0 + odd * cos0;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
even = (float4)(src2.xz, src3.xz);
odd = (float4)(src2.yw, src3.yw);
data0 = even * cos1 - odd * sin1;
data1 = even * sin1 + odd * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \
}
SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)
SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)
SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)
SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)
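// ref2out: requantize one 8-element vector of the reference tensor into the temp
// buffer. multAndoutZP0 and uniU8MulAndPostShift0_Lo_2x8 are assumed to be
// _viv_uniforms declared in the part of this file above the hunk.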
#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \
__read_only image2d_t input_ref, \
image2d_t temp_ref, \
image2d_t output0 \
) \
{ \
int gidx = get_global_id(0); \
Image img0 = create_image_from_image2d(input_ref, 2); \
Image img1 = create_image_from_image2d(temp_ref, 2); \
__global data_type* in_ptr = (__global data_type*)img0.ptr; \
__global data_type* out_ptr = (__global data_type*)img1.ptr; \
data_type src, dst; \
src = in_ptr[gidx]; \
vxc_ushort8 mp0; \
_viv_asm(COPY, mp0, multAndoutZP0, 16); \
VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift0_Lo_2x8); \
out_ptr[gidx] = dst; \
}
SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)
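// update2ref: requantize one row of the update tensor and scatter it into
// temp_ref. offset_idx, offsetX..offsetW, update_width and output_width are
// assumed to be _viv_uniforms declared earlier in this file; they decode an
// up-to-4D index into a flat element offset.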
#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \
__read_only image2d_t input_index, \
__read_only image2d_t input_update, \
image2d_t temp_ref, \
image2d_t input0, \
image2d_t output1, \
int width, int area, int vol, int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
Image img1 = create_image_from_image2d(input_index, 4); \
Image img2 = create_image_from_image2d(input_update, 2); \
Image img3 = create_image_from_image2d(temp_ref, 2); \
__global int* index_ptr = (__global int*)img1.ptr; \
__global data_type* update_ptr = (__global data_type*)img2.ptr; \
__global data_type* output_ptr = (__global data_type*)img3.ptr; \
data_type dst; \
\
int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \
data_type src = update_ptr[gidy * update_width + gidx]; \
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
int loc = idx * output_width + gidx; \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift1_Lo_2x8); \
output_ptr[loc] = dst; \
}
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)
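// F16 needs no requantization, so ref2out and update2ref reduce to plain
// 8-element copies.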
__kernel void scatter_nd_update_ref2out_F16toF16(
__read_only image2d_t input_ref,
image2d_t temp_ref,
image2d_t output0
)
{
int gidx = get_global_id(0);
Image img0 = create_image_from_image2d(input_ref, 2);
Image img1 = create_image_from_image2d(temp_ref, 2);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;
out_ptr[gidx] = in_ptr[gidx];
}
__kernel void scatter_nd_update_update2ref_F16toF16_16x(
__read_only image2d_t input_index,
__read_only image2d_t input_update,
image2d_t temp_ref,
image2d_t input0,
image2d_t output1,
int width, int area, int vol, int coord_dim
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
Image img1 = create_image_from_image2d(input_index, 4);
Image img2 = create_image_from_image2d(input_update, 2);
Image img3 = create_image_from_image2d(temp_ref, 2);
__global int* index_ptr = (__global int*)img1.ptr;
__global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;
__global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;
int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;
int loc = idx * output_width + gidx;
output_ptr[loc] = update_ptr[gidy * update_width + gidx];
}
File diff suppressed because it is too large
@ -29,6 +29,7 @@
#include "VX/vx_ext_program.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_log.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel.h"
@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel
vx_size * program_len = NULL;
const char **program_src = NULL;
vx_context ctx = NULL;
vsi_nn_context_t context = NULL;
vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
uint8_t i = 0;
vsi_bool load_from_file = FALSE;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 128
char cmd[MAX_BUILDPROGRAM_LEN] = {0};
@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel
memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN);
status = VSI_FAILURE;
ctx = vxGetContext( (vx_reference)graph->g );
context = graph->ctx;
evis = context->config.evis.ver;
evis = options->config.evis.ver;
program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *));
CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final );
@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel
{
// set default evis version is 2
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
}
else
{
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
}
status = vxBuildProgram(program, cmd);
@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel
vx_size program_len = 0;
const uint8_t *program_ptr = NULL;
vx_context ctx;
vsi_nn_context_t context;
vsi_nn_runtime_option_t* options;
vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
#define MAX_BUILDPROGRAM_LEN 128
@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel
status = VSI_FAILURE;
ctx = vxGetContext( (vx_reference)graph->g );
context = graph->ctx;
evis = context->config.evis.ver;
options = ((vsi_nn_graph_prv_t*)graph)->options;
evis = options->config.evis.ver;
program_ptr = vsi_nn_VxBinResourceGetResource(
kernel_info->resource_name[kernel_info->resource_num - 1], &program_len);
@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel
{
// set default evis version is 2
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
}
else
{
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
}
#else
snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension");
@ -35,6 +35,8 @@
#include "utils/vsi_nn_constraint_check.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "vsi_nn_tensor_util_prv.h"
#include "vsi_nn_error.h"
static vsi_status _try_set_high_presision_tensor
(
@ -120,9 +122,22 @@ static vsi_status _static_batchnorm
vsi_nn_tensor_t ** outputs
)
{
#define _TENSOR_LEN 64
vsi_status status;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_tensor_t* reshape_tensors[6] = { NULL };
vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
uint32_t new_rank = 4;
vsi_nn_tensor_t* input0 = NULL;
vsi_nn_tensor_t* output = NULL;
char reshape0_tensor_name[_TENSOR_LEN];
char reshape1_tensor_name[_TENSOR_LEN];
char batch_norm_tensor_name[_TENSOR_LEN];
memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name));
status = VSI_FAILURE;
status = _try_set_high_presision_tensor(inputs);
@ -131,10 +146,43 @@ static vsi_status _static_batchnorm
VSILOGE("Set tensor attr of high presision fail");
return status;
}
if(_require_reshape(self, inputs))
if (_require_reshape(self, inputs))
{
reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input;
reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output;
if (3 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
}
else if (5 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
shape[1] = inputs[0]->attr.size[2];
shape[2] = inputs[0]->attr.size[3];
shape[3] = inputs[0]->attr.size[4];
}
input0 = vsi_nn_kernel_insert_reshape_node(self->graph,
inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD);
CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
reshape_tensors[0] = input0;
snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
goto final;
}
output = vsi_nn_kernel_insert_reshape_node(self->graph,
outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD);
CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
reshape_tensors[5] = output;
snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshap 1 node output name fail", self->uid);
goto final;
}
}
else
{
@ -155,12 +203,26 @@ static vsi_status _static_batchnorm
reshape_tensors, 5,
&reshape_tensors[5], 1, param );
if( self->n )
if ( self->n )
{
status = VSI_SUCCESS;
}
vsi_nn_kernel_param_release( &param );
vsi_nn_kernel_param_release(&param);
if (output)
{
snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
goto final;
}
}
final:
vsi_safe_release_tensor(input0);
vsi_safe_release_tensor(output);
return status;
}
@ -313,68 +375,6 @@ static vsi_status op_compute
return status;
} /* op_compute() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
uint32_t dim = 0;
vsi_nn_batcnnorm_lcl_data *local = NULL;
vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
char tensor_name[128];
dim = inputs[0]->attr.dim_num;
if(_require_reshape(self, inputs) == FALSE)
{
return VSI_SUCCESS;
}
VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
/*
reshape 3d input (xcn) --> 4d input (whcn)
reshape 3d output(xcn) --> 4d output(whcn)
*/
dim = 4;
if (3 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
}
else if (5 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
shape[1] = inputs[0]->attr.size[2];
shape[2] = inputs[0]->attr.size[3];
shape[3] = inputs[0]->attr.size[4];
}
local = self->nn_param.batch_norm.local;
if (VSI_NN_OPTIMIZE_BACKWARD == direction)
{
local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
}
else
{
local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
if(local->reshaped_output && local->reshaped_output->t)
{
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
return VSI_FAILURE;
}
}
}
return VSI_SUCCESS;
} /* op_optimize() */
static vsi_bool _dynamic_check
(
vsi_nn_node_t * self,
@ -494,58 +494,6 @@ static vsi_bool op_check
}
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_batcnnorm_lcl_data *local = NULL;
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
}
if(_require_reshape(self, inputs))
{
local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data));
if(NULL == local)
{
return VSI_FAILURE;
}
memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data));
self->nn_param.batch_norm.local = local;
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm);
if(p->local)
{
if (p->local->reshaped_input)
{
vsi_nn_ReleaseTensor(&(p->local->reshaped_input));
p->local->reshaped_input = NULL;
}
if (p->local->reshaped_output)
{
vsi_nn_ReleaseTensor(&(p->local->reshaped_output));
p->local->reshaped_output = NULL;
}
vsi_nn_safe_free(p->local);
}
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
}
#ifdef __cplusplus
extern "C" {
#endif
@ -555,10 +503,10 @@ DEF_OP_REG
/* op_name */ BATCH_NORM,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* setup */ vsi_nn_op_common_setup,
/* optimize */ NULL,
/* input_num */ 5,
/* output_num */ 1
);
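
With op_optimize, op_setup, and op_deinit gone, the 3D/5D handling now lives entirely in _static_batchnorm, and it reduces to one shape mapping. A standalone sketch, with size_t standing in for vsi_size_t:

#include <stddef.h>
#include <stdint.h>

/* Normalize a 3D (x, c, n) or 5D shape, innermost dimension first, to
 * the 4D (w, h, c, n) layout the batch-norm kernel expects. */
static int sk_bn_shape_to_4d(const size_t *in, uint32_t rank, size_t out[4])
{
    if (rank == 3) {                    /* (x, c, n) -> (x, 1, c, n) */
        out[0] = in[0]; out[1] = 1;
        out[2] = in[1]; out[3] = in[2];
        return 1;
    }
    if (rank == 5) {                    /* fold the two innermost dims */
        out[0] = in[0] * in[1];
        out[1] = in[2]; out[2] = in[3]; out[3] = in[4];
        return 1;
    }
    return 0;                           /* other ranks need no reshape */
}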

View File

@ -118,6 +118,7 @@ static vsi_bool op_setup
if (outputs[0]->attr.dim_num == 0)
{
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else

View File

@ -82,6 +82,7 @@ static vsi_bool op_check
{
BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1)
IO_TYPE(D_U32, D_U32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F16, D_F16)
IO_TYPE(D_BF16, D_BF16)
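
The IO_TYPE rows declare which input/output dtype pairs CUMSUM accepts; this hunk adds the I32 -> I32 pair. Conceptually the macros build a table that VALIDATE_OP_IO_TYPES later consults; an illustrative stand-in (not the real macro expansion):

#include <stddef.h>

enum sk_dtype { SK_U32, SK_I32, SK_F32, SK_F16, SK_BF16 };

struct sk_io_pair { enum sk_dtype in, out; };

static const struct sk_io_pair sk_cumsum_io[] = {
    { SK_U32,  SK_U32  },
    { SK_I32,  SK_I32  },    /* the pair this change adds */
    { SK_F32,  SK_F32  },
    { SK_F16,  SK_F16  },
    { SK_BF16, SK_BF16 },
};

static int sk_cumsum_io_supported(enum sk_dtype in, enum sk_dtype out)
{
    size_t i;
    for (i = 0; i < sizeof(sk_cumsum_io) / sizeof(sk_cumsum_io[0]); i++)
        if (sk_cumsum_io[i].in == in && sk_cumsum_io[i].out == out)
            return 1;
    return 0;
}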

View File

@ -253,6 +253,7 @@ static vsi_bool op_check
IO_TYPE(D_BOOL8, D_I32)
IO_TYPE(D_BOOL8, D_U16)
IO_TYPE(D_BOOL8, D_U32)
IO_TYPE(D_BOOL8, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_BOOL8)
IO_TYPE(D_I8|Q_ASYM, D_BOOL8)
IO_TYPE(D_I8|Q_DFP, D_BOOL8)

View File

@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
out_rank = inputs[0]->attr.dim_num;
out_rank = vsi_nn_get_tensor_dims(inputs[0]);
for ( i = 1; i < self->input.num; i++)
{
in2_rank = inputs[i]->attr.dim_num;
in2_rank = vsi_nn_get_tensor_dims(inputs[i]);
out_rank = vsi_nn_max( out_rank, in2_rank );
}
@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup
{
vsi_size_t sz0, sz1;
sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1;
sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1;
for ( j = 1; j < self->input.num; j++)
{
sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1;
sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? inputs[j]->attr.size[i] : 1;
sz0 = vsi_nn_max( sz0, sz1 );
if (sz0 != sz1 && sz0 != 1 && sz1 != 1)
{
@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup
{
outputs[0]->attr.dim_num = out_rank;
memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
if (out_rank == 1 &&
vsi_nn_GetTensorIsScalar(inputs[0]) &&
if (vsi_nn_GetTensorIsScalar(inputs[0]) &&
vsi_nn_GetTensorIsScalar(inputs[1]))
{
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
}
}
else
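
Swapping attr.dim_num for vsi_nn_get_tensor_dims lets scalar-flagged tensors take part in broadcasting, presumably by reporting a usable rank of 1. The broadcast rule itself, as a standalone sketch (names ours; dimensions stored innermost first, as in the loop above):

#include <stddef.h>
#include <stdint.h>

static int sk_broadcast_shapes(const size_t *a, uint32_t a_rank,
                               const size_t *b, uint32_t b_rank,
                               size_t *out, uint32_t *out_rank)
{
    uint32_t rank = a_rank > b_rank ? a_rank : b_rank;
    uint32_t i;
    for (i = 0; i < rank; i++) {
        size_t sa = i < a_rank ? a[i] : 1;   /* missing dims act as 1 */
        size_t sb = i < b_rank ? b[i] : 1;
        if (sa != sb && sa != 1 && sb != 1)
            return 0;                        /* incompatible shapes */
        out[i] = sa > sb ? sa : sb;
    }
    *out_rank = rank;
    return 1;
}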

View File

@ -199,6 +199,7 @@ static vsi_bool op_setup
if (o_rank == 0)
{
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else

View File

@ -306,6 +306,8 @@ static vsi_bool _op_check
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM)
IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM)
END_IO_TYPE_DECL(GROUP_NORM)
if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num))
{

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
@ -197,6 +198,7 @@ static vsi_bool op_setup_default
vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL };
vsi_nn_internal_tensor_t * h_times_r = NULL;
vsi_nn_tensor_attr_t attr;
vsi_nn_activation_e recurrent_activation = p->recurrent_activation;
vsi_nn_internal_init_node_wksp( self );
@ -230,7 +232,8 @@ static vsi_bool op_setup_default
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
self->graph->ctx->config.support_stream_processor)
(((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor &&
recurrent_activation == VSI_NN_ACT_SIGMOID))
{
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
}
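
This condition, and several hunks below, reach the per-graph options through the same down-cast from the public graph handle to the private graph type. One way to read it (the macro is ours, purely illustrative):

#define SK_GRAPH_OPTIONS(g) (((vsi_nn_graph_prv_t *)(g))->options)

/* so the test above reads:
 *   inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
 *   (SK_GRAPH_OPTIONS(self->graph)->config.support_stream_processor &&
 *    recurrent_activation == VSI_NN_ACT_SIGMOID)
 */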

View File

@ -93,37 +93,15 @@ static vsi_bool op_check
{
BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
@ -351,7 +352,7 @@ static vsi_bool op_setup
}
else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 &&
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) ||
self->graph->ctx->config.support_stream_processor )
((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
{
vsi_nn_internal_tensor_t* output_tensor = NULL;
vsi_nn_internal_tensor_t* reshape_tensor = NULL;

View File

@ -106,7 +106,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp( self );
if ( axis != 0 && !self->graph->ctx->config.support_stream_processor)
if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
{
vsi_nn_internal_tensor_t* mean_tensor = NULL;
vsi_nn_internal_tensor_t* vari_tensor = NULL;

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
@ -139,7 +140,7 @@ static vsi_bool op_setup
p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL;
p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL;
if (self->graph->ctx->config.support_stream_processor)
if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
{
p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL;
}

View File

@ -100,6 +100,7 @@ static vsi_bool op_check
IO_TYPE(D_I32, D_I16|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_SYM)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
@ -111,8 +112,10 @@ static vsi_bool op_check
IO_TYPE(D_U8|Q_ASYM, D_BF16)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_BF16)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16)
IO_TYPE(D_I8|Q_DFP, D_BF16)
IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
@ -124,11 +127,14 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I16|Q_ASYM, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_BF16)
IO_TYPE(D_I16|Q_ASYM, D_F32)
IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_SYM, D_BF16)
IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_SYM, D_F16)
IO_TYPE(D_I8|Q_SYM, D_BF16)
IO_TYPE(D_BF16, D_BF16)
END_IO_TYPE_DECL(ONE_HOT)
if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num))

View File

@ -36,6 +36,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
@ -50,33 +51,52 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
param =vsi_nn_kernel_param_create();
vsi_nn_tensor_t* reshape_tensor = NULL;
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_pre_process_rgb_param* p = NULL;
vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x );
vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y );
vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left );
vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top );
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean );
vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale );
vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale );
vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy );
n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param );
if( n != NULL )
memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t));
shape[0] = shape[1] * shape[0];
shape[1] = shape[2];
shape[2] = 1;
reshape_tensor = vsi_nn_reshape_tensor(self->graph,
inputs[0], shape, inputs[0]->attr.dim_num);
CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final);
p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x );
vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y );
vsi_nn_kernel_param_add_int32( param, "left", p->rect.left );
vsi_nn_kernel_param_add_int32( param, "top", p->rect.top );
vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean );
vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale );
vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale );
vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy );
n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param );
if ( n != NULL )
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
if(param != NULL)
if (param != NULL)
{
vsi_nn_kernel_param_release( &param );
}
final:
vsi_safe_release_tensor(reshape_tensor);
return status;
} /* op_compute() */
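
The reshape inserted ahead of the kernel-selector call flattens the pixel dimensions so the kernel sees the image as one flat row per channel. The mapping in isolation (innermost dimension first, input laid out (w, h, c), rank of at least 3 assumed; the helper name is ours):

#include <stddef.h>
#include <string.h>

/* (w, h, c, ...) -> (w*h, c, 1, ...); trailing dims are kept as-is. */
static void sk_flatten_hw(const size_t *in, unsigned rank, size_t *out)
{
    memcpy(out, in, rank * sizeof(size_t));
    out[0] = in[0] * in[1];
    out[1] = in[2];
    out[2] = 1;
}
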
@ -166,35 +186,57 @@ static vsi_bool op_setup
}
self->nn_param.pre_process_rgb.local.enable_perm = FALSE;
p->local->enable_perm = FALSE;
if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE)
if (p->local->enable_perm == FALSE)
{
p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
}
else
{
p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
}
p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15)));
p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));
return TRUE;
} /* op_setup() */
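
scale_x and scale_y are fixed-point resize factors with 15 fractional bits, src_extent/dst_extent scaled by 1 << 15, and enable_copy fires when both equal exactly 1.0 in that encoding. A sketch (names ours; widened to 64 bits to sidestep overflow on large extents):

#include <stdint.h>

static int32_t sk_resize_scale_q15(uint32_t src, uint32_t dst)
{
    return (int32_t)(((uint64_t)src << 15) / dst);
}

/* Straight copy when neither axis actually scales. */
static int sk_is_copy(int32_t scale_x, int32_t scale_y)
{
    return scale_x == scale_y && scale_x == (1 << 15);
}
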
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
self->nn_param.pre_process_rgb.local =
(vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data));
if (NULL == self->nn_param.pre_process_rgb.local)
{
return VX_ERROR_NO_MEMORY;
}
memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data));
return status;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
if (self->nn_param.pre_process_rgb.local.local_tensor != NULL)
if (self->nn_param.pre_process_rgb.local->local_tensor != NULL)
{
vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor);
self->nn_param.pre_process_rgb.local.local_tensor = NULL;
vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor);
self->nn_param.pre_process_rgb.local->local_tensor = NULL;
}
vsi_nn_safe_free(self->nn_param.pre_process_rgb.local);
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
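
Because the local data moved from an inline member to the heap, an op_init/op_deinit pair now owns its lifetime. The pairing reduced to a sketch (struct contents abbreviated, names ours):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct sk_lcl_data { int32_t scale_x, scale_y; int enable_perm, enable_copy; };

static struct sk_lcl_data *sk_lcl_init(void)
{
    struct sk_lcl_data *d = malloc(sizeof(*d));
    if (d != NULL)
        memset(d, 0, sizeof(*d));       /* zeroed, as op_init does above */
    return d;
}

static void sk_lcl_deinit(struct sk_lcl_data **d)
{
    if (d != NULL && *d != NULL) {      /* free once, then NULL the slot,
                                         * mirroring vsi_nn_safe_free */
        free(*d);
        *d = NULL;
    }
}
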
@ -208,7 +250,7 @@ extern "C" {
DEF_OP_REG
(
/* op_name */ PRE_PROCESS_RGB,
/* init */ NULL,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,

View File

@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_prelu_param *prelu = &self->nn_param.prelu;
vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 };
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_nn_tensor_t* input0 = NULL;
vsi_nn_tensor_t* input1 = NULL;
vsi_nn_tensor_t* output = NULL;
vsi_bool one_rank = FALSE;
vsi_bool is_per_channel_alpha = 0;
vsi_size_t alpha_shape = 1;
@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute
uint32_t dims = outputs[0]->attr.dim_num;
reshape_tensors[0] = inputs[0];
reshape_tensors[2] = outputs[0];
one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape);
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute
dims = inputs[1]->attr.dim_num;
}
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, dims );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
}
else
{
memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
}
}
else
{
uint32_t rank = inputs[0]->attr.dim_num;
dims = inputs[1]->attr.dim_num;
memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute
shapes[1] = 1;
dims = 2;
}
else if (one_rank && inputs[1]->attr.is_const == TRUE &&
alpha_shape == inputs[0]->attr.size[0] &&
alpha_shape == inputs[1]->attr.size[0] &&
rank < 3)
{
is_per_channel_alpha = TRUE;
shapes[0] = 1;
shapes[1] = 1;
shapes[2] = alpha_shape;
shapes[3] = rank > 1 ? inputs[0]->attr.size[1] : 1;
dims = 4;
input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims);
CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
reshape_tensors[0] = input0;
output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims);
CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
reshape_tensors[2] = output;
shapes[0] = alpha_shape;
shapes[1] = 1;
dims = 2;
}
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, dims );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
}
// Add params
@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
kernel_name,
&reshape_tensors[0], 2,
outputs, 1, param );
&reshape_tensors[2], 1, param );
vsi_nn_kernel_param_release( &param );
vsi_nn_ReleaseTensor( &reshape_tensors[1] );
if( self->n )
if ( self->n )
{
status = VSI_SUCCESS;
}
final:
vsi_safe_release_tensor(input0);
vsi_safe_release_tensor(input1);
vsi_safe_release_tensor(output);
return status;
} /* _prelu_op_compute() */
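
The new branch lifts a constant per-channel alpha on low-rank inputs into a layout the per-channel kernel can use. The shape arithmetic in isolation (innermost dimension first; for a rank-1 input n is 1; names ours):

#include <stddef.h>

/* Input (c, n) with a length-c alpha: data and output are lifted to
 * (1, 1, c, n) so c lands on the channel axis; alpha becomes (c, 1). */
static void sk_prelu_per_channel_shapes(size_t c, size_t n,
                                        size_t data_shape[4],
                                        size_t alpha_shape[2])
{
    data_shape[0] = 1;  data_shape[1] = 1;
    data_shape[2] = c;  data_shape[3] = n;
    alpha_shape[0] = c; alpha_shape[1] = 1;
}
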
@ -211,28 +247,36 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(PRELU, 2, 1)
IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
IO_TYPE(D_BF16, D_F16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_I32, D_I32, D_I32)
IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_BF16, D_F16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_I32, D_I32, D_I32)
/* HW 9.0 */
IO_TYPE(D_F32, D_BF16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_F32)
IO_TYPE(D_F32, D_BF16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_F32)
END_IO_TYPE_DECL(PRELU)
if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
if (!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);

View File

@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type
int32_t * axes = self->nn_param.reduce.local2->axes;
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
if ( !self->graph->ctx->config.support_stream_processor ||
if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
(type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
{
return FALSE;
@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis(
}
*out_rank_x = inputs[0]->attr.dim_num;
}
else if (!self->graph->ctx->config.support_stream_processor ||
else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
resolved_dim_count > 2)
{
optimzation_input_size(

View File

@ -61,7 +61,7 @@ static vsi_status op_compute
vx_nn_reshape_params_t reshape_param;
memset(&attr, 0, sizeof(attr));
attr.size[0] = self->nn_param.reshape.dim_num;
attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1);
attr.dim_num = 1;
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -124,17 +124,28 @@ static vsi_bool op_setup
vsi_bool ret = TRUE;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t i = 0;
for (i = 0; i < self->nn_param.reshape.dim_num; i++)
if (self->nn_param.reshape.dim_num == 0 ||
self->nn_param.reshape.size == NULL
)
{
shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
(vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t i = 0;
for (i = 0; i < self->nn_param.reshape.dim_num; i++)
{
shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
(vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
}
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape.dim_num);
}
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape.dim_num);
}
return ret;
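
op_setup now maps the user's (uint32_t)-1 wildcard into vsi_size_t before handing the shape to vsi_nn_CalcReshapeTensor. The usual semantics behind such a helper, sketched under assumptions (this is not the library's code): at most one -1 entry, which absorbs whatever factor of the element count the explicit dims leave over.

#include <stddef.h>

static int sk_resolve_reshape(size_t total_elems, const long *req,
                              unsigned rank, size_t *out)
{
    size_t known = 1;
    int wild = -1;
    unsigned i;

    for (i = 0; i < rank; i++) {
        if (req[i] == -1) {
            if (wild >= 0)
                return 0;               /* at most one wildcard */
            wild = (int)i;
        } else {
            known *= (size_t)req[i];
            out[i] = (size_t)req[i];
        }
    }
    if (wild >= 0) {
        if (known == 0 || total_elems % known != 0)
            return 0;                   /* leftover must divide evenly */
        out[wild] = total_elems / known;
        return 1;
    }
    return known == total_elems;        /* no wildcard: must match */
}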

View File

@ -66,7 +66,7 @@ static vsi_status op_compute
}
memset(&attr, 0, sizeof(attr));
attr.size[0] = self->nn_param.reshape2.dim_num;
attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1);
attr.dim_num = 1;
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -161,13 +161,24 @@ static vsi_bool op_setup
vsi_bool ret = TRUE;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
memcpy(shape, self->nn_param.reshape2.size,
sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape2.dim_num);
if (self->nn_param.reshape2.dim_num == 0 ||
self->nn_param.reshape2.size == NULL
)
{
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
memcpy(shape, self->nn_param.reshape2.size,
sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape2.dim_num);
}
}
return ret;

Some files were not shown because too many files have changed in this diff