From 161bb8a7c4ab6a84b28cecce08a71804ca60786a Mon Sep 17 00:00:00 2001
From: Zhouheng Zheng
Date: Tue, 1 Mar 2022 17:56:03 +0800
Subject: [PATCH] Pre-release for 22Q1 (#302)

update internal to commit-id: d45da6fa

Co-authored-by: zhouheng.zheng
---
 .../include/custom/custom_node_type.def | 3 +
 .../vx/internal/include/custom/custom_ops.def | 3 +
 ...si_nn_op_custom_ainr_denoise_postprocess.h | 47 +
 .../custom/ops/vsi_nn_op_custom_warp_affine.h | 49 +
 .../ops/vsi_nn_op_custom_warp_perspective.h | 50 +
 .../include/custom/vsi_nn_custom_node_type.h | 3 +
 src/tim/vx/internal/include/interface/ops.def | 3 +
 .../include/internal/internal_ops.def | 1 +
 .../internal/include/kernel/vsi_nn_kernel.h | 14 +
 .../kernel/vsi_nn_kernel_gpu_shape_optimize.h | 6 +
 .../internal/include/ops/vsi_nn_op_deconv3d.h | 54 +
 .../internal/include/ops/vsi_nn_op_gather.h | 1 +
 .../vx/internal/include/ops/vsi_nn_op_pad2.h | 50 +
 .../internal/include/ops/vsi_nn_op_reduce.h | 2 +-
 .../ops/vsi_nn_op_reduce_mean_internal.h | 49 +
 .../vx/internal/include/utils/vsi_nn_util.h | 26 +
 .../vx/internal/include/vsi_nn_client_op.h | 11 +
 src/tim/vx/internal/include/vsi_nn_context.h | 1 +
 .../internal/include/vsi_nn_feature_config.h | 23 -
 src/tim/vx/internal/include/vsi_nn_graph.h | 23 +
 src/tim/vx/internal/include/vsi_nn_log.h | 9 +-
 .../vx/internal/include/vsi_nn_node_type.h | 8 +-
 src/tim/vx/internal/include/vsi_nn_ops.h | 9 +-
 src/tim/vx/internal/include/vsi_nn_version.h | 2 +-
 .../ops/kernel/cpu/custom_softmax_cpu.c | 2 +-
 .../ops/kernel/cpu/custom_warp_affine_cpu.c | 296 ++
 .../kernel/cpu/custom_warp_perspective_cpu.c | 300 ++
 .../ops/kernel/evis/custom_warp_affine_evis.c | 295 ++
 .../evis/custom_warp_perspective_evis.c | 300 ++
 .../ops/op_custom_ainr_denoise_postprocess.c | 136 +
 .../custom/ops/vsi_nn_op_custom_warp_affine.c | 136 +
 .../ops/vsi_nn_op_custom_warp_perspective.c | 136 +
 src/tim/vx/internal/src/kernel/cl/clip_cl.c | 18 +-
 .../src/kernel/cl/depth2space_internal_cl.c | 226 +
 .../internal/src/kernel/cl/eltwise_unary_cl.c | 10 +
 .../vx/internal/src/kernel/cl/floordiv_cl.c | 79 +-
 src/tim/vx/internal/src/kernel/cl/gather_cl.c | 114 +-
 .../src/kernel/cl/group_normalization_cl.c | 44 +-
 .../src/kernel/cl/instance_normalization_cl.c | 6 +-
 .../vx/internal/src/kernel/cl/moments_cl.c | 5 +
 src/tim/vx/internal/src/kernel/cl/topk_cl.c | 301 ++
 .../src/kernel/cpu/eltwise_unary_cpu.c | 10 +
 .../vx/internal/src/kernel/cpu/gather_cpu.c | 76 +-
 .../internal/src/kernel/cpu/gather_nd_cpu.c | 5 +-
 .../kernel/cpu/instance_normalization_cpu.c | 100 +-
 .../src/kernel/cpu/resize_bilinear_cpu.c | 2 -
 .../vx/internal/src/kernel/evis/argmax_evis.c | 77 +-
 .../src/kernel/evis/comparisons_evis.c | 30 +-
 .../kernel/evis/depth2space_internal_evis.c | 11 +
 .../src/kernel/evis/eltwise_unary_evis.c | 27 +
 .../vx/internal/src/kernel/evis/gather_evis.c | 147 +-
 .../kernel/evis/group_normalization_evis.c | 43 +-
 .../kernel/evis/instance_normalization_evis.c | 7 +-
 .../src/kernel/evis/logical_ops_evis.c | 28 +-
 .../internal/src/kernel/evis/matrixmul_evis.c | 50 +
 .../internal/src/kernel/evis/maximum_evis.c | 47 +-
 .../internal/src/kernel/evis/minimum_evis.c | 47 +-
 .../internal/src/kernel/evis/moments_evis.c | 94 +
 .../internal/src/kernel/evis/one_hot_evis.c | 87 +-
 .../vx/internal/src/kernel/evis/prelu_evis.c | 4 +-
 .../src/kernel/evis/resize_bilinear_evis.c | 120 +-
 .../src/kernel/evis/scatter_nd_evis.c | 38 +
 .../vx/internal/src/kernel/vsi_nn_kernel.c | 321 +-
 .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 44 +-
 .../vx/internal/src/kernel/vx/batch_norm_vx.c | 84 +
 .../vx/internal/src/kernel/vx/convolutional.c | 92 +-
 src/tim/vx/internal/src/kernel/vx/pad2_vx.c | 113 +
 .../internal/src/libnnext/ops/cl/clip_BF16.cl | 37 +
 .../src/libnnext/ops/cl/depth2space_crd.cl | 17 +
 .../src/libnnext/ops/cl/eltwise_unary.cl | 9 +
 .../internal/src/libnnext/ops/cl/floordiv.cl | 186 +-
 .../vx/internal/src/libnnext/ops/cl/gather.cl | 12 +-
 .../src/libnnext/ops/cl/gather_batch.cl | 123 +
 .../src/libnnext/ops/cl/moments_axis0.cl | 42 +
 .../src/libnnext/ops/cl/moments_axis01.cl | 60 +
 .../src/libnnext/ops/cl/moments_axis012.cl | 61 +
 .../src/libnnext/ops/cl/moments_axis1.cl | 41 +
 .../src/libnnext/ops/cl/moments_axis2.cl | 44 +-
 .../vx/internal/src/libnnext/ops/cl/topk.cl | 251 ++
 .../src/libnnext/ops/vx/argmax_axis2.vx | 101 +-
 .../ops/vx}/custom_softmax.vx | 11 +-
 .../src/libnnext/ops/vx/custom_warp_affine.vx | 353 ++
 .../ops/vx/custom_warp_perspective.vx | 395 ++
 .../src/libnnext/ops/vx/depth2space_crd.vx | 2 +-
 .../src/libnnext/ops/vx/eltwise_unary_2d.vx | 18 +
 .../src/libnnext/ops/vx/eltwise_unary_3d.vx | 18 +
 .../vx/internal/src/libnnext/ops/vx/gather.vx | 2 -
 .../src/libnnext/ops/vx/gather_batch.vx | 237 ++
 .../src/libnnext/ops/vx/gather_mix_batch.vx | 236 ++
 .../src/libnnext/ops/vx/logical_ops.vx | 53 +-
 .../src/libnnext/ops/vx/matrixmul_bf16.vx | 272 ++
 .../src/libnnext/ops/vx/matrixmul_f16.vx | 23 +-
 .../internal/src/libnnext/ops/vx/maximum.vx | 56 +
 .../src/libnnext/ops/vx/maximum_i16.vx | 60 +
 .../internal/src/libnnext/ops/vx/minimum.vx | 56 +
 .../src/libnnext/ops/vx/minimum_i16.vx | 60 +
 .../src/libnnext/ops/vx/moments_axis0.vx | 85 +
 .../src/libnnext/ops/vx/moments_axis012.vx | 78 +
 .../src/libnnext/ops/vx/moments_axis1.vx | 84 +
 .../src/libnnext/ops/vx/moments_axis2.vx | 46 +
 .../src/libnnext/ops/vx/moments_u8_axis012.vx | 142 +-
 .../internal/src/libnnext/ops/vx/one_hot.vx | 88 +
 .../src/libnnext/ops/vx/relational_ops_2d.vx | 47 +-
 .../src/libnnext/ops/vx/relational_ops_3d.vx | 41 +
 ...esize_bilinear_U8_half_pixel_centers_1.vx} | 0
 ...resize_bilinear_U8_half_pixel_centers_2.vx | 129 +
 .../src/libnnext/ops/vx/scatter_nd.vx | 50 +-
 .../src/libnnext/vsi_nn_libnnext_resource.c | 3740 ++++++++++++++++-
 .../internal/src/libnnext/vsi_nn_vxkernel.c | 18 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_abs.c | 1 +
 .../internal/src/ops/vsi_nn_op_batch_norm.c | 44 +-
 .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 2 +-
 .../internal/src/ops/vsi_nn_op_dataconvert.c | 6 +-
 .../vx/internal/src/ops/vsi_nn_op_deconv3d.c | 302 ++
 .../src/ops/vsi_nn_op_depth2space_internal.c | 2 +
 .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 100 +-
 .../src/ops/vsi_nn_op_eltwise_unary.c | 7 +-
 .../vx/internal/src/ops/vsi_nn_op_floordiv.c | 6 +
 .../src/ops/vsi_nn_op_fullconnect_relu.c | 15 +-
 .../vx/internal/src/ops/vsi_nn_op_gather.c | 20 +-
 .../src/ops/vsi_nn_op_grouped_conv2d.c | 6 +
 .../src/ops/vsi_nn_op_instancenormalize.c | 211 +-
 .../src/ops/vsi_nn_op_l2normalizescale.c | 5 +-
 .../vx/internal/src/ops/vsi_nn_op_linear.c | 4 +-
 .../internal/src/ops/vsi_nn_op_logical_ops.c | 1 +
 .../src/ops/vsi_nn_op_lstmunit_ovxlib.c | 2 +-
 .../vx/internal/src/ops/vsi_nn_op_matrixmul.c | 1 +
 .../vx/internal/src/ops/vsi_nn_op_moments.c | 2 +
 .../vx/internal/src/ops/vsi_nn_op_one_hot.c | 1 +
 src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c | 198 +
 .../vx/internal/src/ops/vsi_nn_op_reduce.c | 185 +-
 .../src/ops/vsi_nn_op_reduce_mean_internal.c | 163 +
 .../internal/src/ops/vsi_nn_op_scatter_nd.c | 1 +
 .../src/ops/vsi_nn_op_softmax_internal.c | 6 +-
 .../vx/internal/src/ops/vsi_nn_op_square.c | 1 +
 src/tim/vx/internal/src/ops/vsi_nn_op_stack.c | 10 +
 .../src/ops/vsi_nn_op_strided_slice.c | 1 +
 .../src/utils/vsi_nn_code_generator.c | 6 +-
 src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 12 +
 src/tim/vx/internal/src/utils/vsi_nn_util.c | 107 +-
 src/tim/vx/internal/src/vsi_nn_client_op.c | 39 +
 src/tim/vx/internal/src/vsi_nn_context.c | 9 +-
 src/tim/vx/internal/src/vsi_nn_graph.c | 71 +-
 .../internal/src/vsi_nn_graph_optimization.c | 4 +-
 src/tim/vx/internal/src/vsi_nn_log.c | 3 +-
 src/tim/vx/internal/src/vsi_nn_node.c | 4 +-
 src/tim/vx/internal/src/vsi_nn_ops.c | 31 +-
 .../vx/internal/src/vsi_nn_pre_post_process.c | 37 +-
 src/tim/vx/internal/src/vsi_nn_tensor.c | 46 +-
 149 files changed, 12641 insertions(+), 970 deletions(-)
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c
 create mode 100644 src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c
 create mode 100644 src/tim/vx/internal/src/kernel/cl/topk_cl.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/pad2_vx.c
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
 rename src/tim/vx/internal/src/{custom/ops/kernel/evis => libnnext/ops/vx}/custom_softmax.vx (83%)
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx
 rename src/tim/vx/internal/src/libnnext/ops/vx/{resize_bilinear_U8_half_pixel_centers.vx => resize_bilinear_U8_half_pixel_centers_1.vx} (100%)
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c

diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def
b/src/tim/vx/internal/include/custom/custom_node_type.def index 034c37f..0283c71 100644 --- a/src/tim/vx/internal/include/custom/custom_node_type.def +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -2,3 +2,6 @@ custom op data struct def */ DEF_NODE_TYPE(custom_softmax) +DEF_NODE_TYPE(custom_ainr_denoise_postprocess) +DEF_NODE_TYPE(custom_warp_affine) +DEF_NODE_TYPE(custom_warp_perspective) diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def index 8ef4d50..690b057 100644 --- a/src/tim/vx/internal/include/custom/custom_ops.def +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -2,3 +2,6 @@ Add custom ops to the end. */ DEF_OP(CUSTOM_SOFTMAX) +DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS) +DEF_OP(CUSTOM_WARP_AFFINE) +DEF_OP(CUSTOM_WARP_PERSPECTIVE) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h new file mode 100644 index 0000000..1a7e623 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H +#define _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_ainr_denoise_postprocess_param +{ + struct _ainr_denoise_postprocess_local_data_t* local; + // Add parameters here +} vsi_nn_custom_ainr_denoise_postprocess_param; +_compiler_assert(offsetof(vsi_nn_custom_ainr_denoise_postprocess_param, local) == 0, \ + vsi_nn_custom_ainr_denoise_postprocess_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h new file mode 100644 index 0000000..815a064 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_WARP_AFFINE_H +#define _VSI_NN_OP_CUSTOM_WARP_AFFINE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_warp_affine_param +{ + struct _custom_warp_affine_local_data_t* local; + // Add parameters here + const float *matrix; + vsi_enum type; + int32_t size[2]; +} vsi_nn_custom_warp_affine_param; +_compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \ + vsi_nn_custom_warp_affine_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h new file mode 100644 index 0000000..8aceb2a --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H +#define _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_warp_perspective_param +{ + struct _custom_warp_perspective_local_data_t* local; + // Add parameters here + const float *matrix; + vsi_enum type; + int32_t size[2]; +} vsi_nn_custom_warp_perspective_param; +_compiler_assert(offsetof(vsi_nn_custom_warp_perspective_param, local) == 0, \ + vsi_nn_custom_warp_perspective_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h index 16d3d0c..1a05c8a 100644 --- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h +++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h @@ -27,5 +27,8 @@ custom op head files */ #include "custom/ops/vsi_nn_op_custom_softmax.h" +#include "custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h" +#include "custom/ops/vsi_nn_op_custom_warp_affine.h" +#include "custom/ops/vsi_nn_op_custom_warp_perspective.h" #endif diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index cf5bebb..4765bd5 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -165,3 +165,6 @@ DEF_OP(GRUCELL) DEF_OP(GRUCELL_ACTIVATION) DEF_OP(RESHAPE2) DEF_OP(CONV3D) +DEF_OP(DECONV3D) +DEF_OP(PAD2) +DEF_OP(COS) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def index 06dbc61..a47559a 100644 --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -19,3 +19,4 @@ DEF_OP(RESIZE_1D_NEAREST_INTERNAL) DEF_OP(SPACE2DEPTH_INTERNAL) DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R) DEF_OP(GRUCELL_ACTIVATION_Z_H) +DEF_OP(REDUCE_MEAN_INTERNAL) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 05222b2..f8163be 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -640,6 +640,13 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node vsi_nn_kernel_t * kernel ); +vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ); + vsi_status vsi_nn_kernel_node_set_border (vsi_nn_kernel_node_t node, vx_border_t* border); @@ -720,6 +727,13 @@ vsi_status vsi_nn_kernel_register vsi_nn_kernel_t * kernel ); +vsi_status vsi_nn_kernel_register_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ); + vsi_bool vsi_nn_kernel_gpu_check_shape ( const vsi_size_t * shape, vsi_size_t rank ); diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index 1f4c947..26a676f 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -79,4 +79,10 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape vsi_size_t* out_shape, uint32_t* out_rank ); +vsi_bool vsi_nn_kernel_optimize_group_norm_shape + ( + const vsi_size_t* shape, const uint32_t rank, int32_t groups, + int32_t is_sp_kernel, vsi_size_t* 
out_shape + ); + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h new file mode 100644 index 0000000..133267f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_DECONV3D_H +#define _VSI_NN_OP_DECONV3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_deconv3d_param +{ + struct _deconv3d_local_data_t* local; + // Add parameters here + uint32_t ksize[3]; + uint32_t stride[3]; + /* Pad left, right, top, bottom, front, rear */ + uint32_t pad[6]; + + uint32_t weights; + uint32_t group; + uint32_t output_padding[3]; +} vsi_nn_deconv3d_param; +_compiler_assert(offsetof(vsi_nn_deconv3d_param, local) == 0, \ + vsi_nn_deconv3d_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h index 0d76800..c9792c9 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h @@ -41,6 +41,7 @@ typedef struct _vsi_nn_gather_param { vsi_nn_gather_lcl_data local; int32_t axis; + int32_t batch_dims; } vsi_nn_gather_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h new file mode 100644 index 0000000..f2672a2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_PAD2_H +#define _VSI_NN_OP_PAD2_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_pad2_param +{ + struct _pad2_local_data_t* local; + const uint32_t * front_size; + const uint32_t * back_size; + uint8_t dim_num; + float const_val; + vsi_nn_pad_mode_e mode; +} vsi_nn_pad2_param; +_compiler_assert(offsetof(vsi_nn_pad2_param, local) == 0, \ + vsi_nn_pad2_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h index cf7bb8b..57997d2 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h @@ -51,7 +51,7 @@ typedef struct _vsi_nn_reduce_param { /* local data must be the first. */ vsi_nn_reduce_lcl_data_t local; - vx_enum type; + vsi_enum type; const int32_t *axis; vx_uint32 axis_num; vx_bool keep_dim; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h new file mode 100644 index 0000000..20eb56c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H +#define _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reduce_mean_internal_param +{ + struct _reduce_mean_internal_local_data_t* local; + // Add parameters here + vx_int32 *axis; + vx_uint32 axis_num; + float scale; +} vsi_nn_reduce_mean_internal_param; +_compiler_assert(offsetof(vsi_nn_reduce_mean_internal_param, local) == 0, \ + vsi_nn_reduce_mean_internal_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 7aa984e..8687247 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -28,6 +28,7 @@ /*------------------------------------------- Includes -------------------------------------------*/ +#include #include "vsi_nn_platform.h" #include "vsi_nn_tensor.h" #include "vsi_nn_types.h" @@ -398,6 +399,31 @@ void vsi_nn_get_tensor_clamp_min_max float *clampMax ); +char* vsi_nn_strncpy + ( + char* dest, + const char* source, + size_t count + ); + +char* vsi_nn_strncat + ( + char* dest, + const char* source, + size_t count + ); + +char* vsi_nn_getenv + ( + const char * var_name + ); + +FILE* vsi_nn_fopen + ( + const char * file_name, + const char * mode + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_client_op.h b/src/tim/vx/internal/include/vsi_nn_client_op.h index 856f81b..c166ce7 100644 --- a/src/tim/vx/internal/include/vsi_nn_client_op.h +++ b/src/tim/vx/internal/include/vsi_nn_client_op.h @@ -71,6 +71,17 @@ OVXLIB_API void vsi_nn_OpRemoveClient vsi_nn_op_t op ); +vsi_bool vsi_nn_OpAddClientName + ( + vsi_nn_op_t op, + const char* kernel_name + ); + +const char* vsi_nn_OpGetClientName + ( + vsi_nn_op_t op + ); + #if defined(__cplusplus) } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 4374441..20a4dd1 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -73,6 +73,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_shader; int32_t enable_opcheck; int32_t enable_concat_optimize; + int32_t enable_asymi8_to_u8; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index db38ecc..8906a96 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,26 +1,3 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the Software), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index ffb5dd0..dda35b7 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -456,6 +456,29 @@ OVXLIB_API vsi_nn_node_t * vsi_nn_AddNode vsi_nn_node_id_t * node_id ); +/** + * Add External node + * Create a new External node and attach it to graph. + * + * @param[in] graph Graph handle + * @param[in] op Node operation. + * @param[in] vsi_nn_proc_t to this node. + * @param[in] output_num Number of outputs to this node. + * @param[in] kernel name. + * @param[out] node_id A handle to get the id of new node, + * pass it to NULL to get nothing. + * + * @return The node handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_node_t * vsi_nn_AddExternalNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + const void * proc, + vsi_nn_node_id_t * node_id, + const char *kernel_name + ); + /** * @deprecated * @see vsi_nn_AddNode diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h index fd7d37a..d3afaa2 100644 --- a/src/tim/vx/internal/include/vsi_nn_log.h +++ b/src/tim/vx/internal/include/vsi_nn_log.h @@ -24,14 +24,18 @@ #ifndef _VSI_NN_LOG_H #define _VSI_NN_LOG_H -#include + +#include "utils/vsi_nn_util.h" #if defined(__cplusplus) extern "C"{ #endif #ifdef _MSC_VER -#define snprintf _snprintf +#define snprintf(buffer, count, format, ...) 
\ + _snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__) +#define vsnprintf(buffer, count, format, args) \ + _vsnprintf_s(buffer, count, _TRUNCATE, format, args) #endif typedef enum _vsi_nn_log_level_e @@ -68,4 +72,3 @@ OVXLIB_API void vsi_nn_LogMsg #endif #endif - diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 0278c4b..395ee2e 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -182,6 +182,9 @@ #include "ops/vsi_nn_op_conv3d.h" #include "ops/vsi_nn_op_grucell_h_times_activation_r.h" #include "ops/vsi_nn_op_grucell_activation_z_h.h" +#include "ops/vsi_nn_op_deconv3d.h" +#include "ops/vsi_nn_op_reduce_mean_internal.h" +#include "ops/vsi_nn_op_pad2.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -350,7 +353,10 @@ typedef union _vsi_nn_nn_param vsi_nn_conv3d_param conv3d; vsi_nn_grucell_h_times_activation_r_param grucell_h_times_activation_r; vsi_nn_grucell_activation_z_h_param grucell_activation_z_h; - uint8_t client_param[128]; + vsi_nn_deconv3d_param deconv3d; + vsi_nn_reduce_mean_internal_param reduce_mean_internal; + vsi_nn_pad2_param pad2; + void* client_param; /* custom node data struct define */ #define DEF_NODE_TYPE( NAME ) vsi_nn_##NAME##_param NAME; diff --git a/src/tim/vx/internal/include/vsi_nn_ops.h b/src/tim/vx/internal/include/vsi_nn_ops.h index 4c79499..40671e7 100644 --- a/src/tim/vx/internal/include/vsi_nn_ops.h +++ b/src/tim/vx/internal/include/vsi_nn_ops.h @@ -48,7 +48,7 @@ extern "C"{ * @see include/custom/custom_ops.def * @see include/internal/internal_ops.def */ -typedef uint32_t vsi_nn_op_t; enum +typedef int32_t vsi_nn_op_t; enum { #define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME, #include "interface/ops.def" @@ -317,6 +317,13 @@ vsi_bool vsi_nn_OpRegisterOvxInit vsi_nn_op_compute_t compute ); +vsi_bool vsi_nn_OpRegisterExternalOvxInit + ( + vsi_nn_op_t op, + const char* kernel_name, + vsi_nn_op_proc_t* proc + ); + /** * Get operation name * Get operation name string by operation id. 
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index b0acac3..328aa19 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 37 +#define VSI_NN_VERSION_PATCH 39 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c index 1eeb997..ed1e149 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c @@ -77,7 +77,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec) /* alloc the float32 data buffer */ buffer[1] = (float *)malloc(out_elements * sizeof(float)); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); memset(buffer[1], 0, out_elements * sizeof(float)); buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c new file mode 100644 index 0000000..f2cb031 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c @@ -0,0 +1,296 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_affine") + + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_affine_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) +#define SCALAR_INPUT_TYPE (2) +#define SCALAR_MATRIX_OFFSET (3) + +static void _transform_affine + ( + vsi_size_t dst_x, + vsi_size_t dst_y, + const float m[], + float *src_x, + float *src_y + ) +{ + *src_x = dst_x * m[0] + dst_y * m[2] + m[4]; + *src_y = dst_x * m[1] + dst_y * m[3] + m[5]; +} + +static vsi_bool _read_pixel + ( + float *base, + vsi_nn_kernel_tensor_attr_t *attr, + float x, + float y, + float *pixel + ) +{ + vsi_size_t width = attr->shape->data[0]; + vsi_size_t height = attr->shape->data[1]; + vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height); + vsi_size_t bx = 0, by = 0; + + if (out_of_bounds) + { + *pixel = 205.0f; + return TRUE; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x; + by = y < 0 ? 0 : y >= height ? height - 1 : (vsi_size_t)y; + + *pixel = base[by * width + bx]; + + return TRUE; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + float* buffer[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; + int32_t type = 0; + float matrix[6] = {0}; + vsi_size_t i = 0; + vsi_size_t b = 0; + vsi_size_t x = 0; + vsi_size_t y = 0; + vsi_size_t out_elements = 0; + vsi_size_t width = 0; + vsi_size_t height = 0; + vsi_size_t outer_size = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + /* alloc the float32 data buffer */ + buffer[1] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + memset(buffer[1], 0, out_elements * sizeof(float)); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE], + &type); + CHECK_STATUS_FAIL_GOTO(status, final ); + for (i = 0; i < 6; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &matrix[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + width = attr[1]->shape->data[0]; + height = attr[1]->shape->data[1]; + for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i) + 
{ + outer_size *= attr[1]->shape->data[i]; + } + // Do something + for (b = 0; b < outer_size; b++) + { + float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1]; + float *dst_base = buffer[1] + b * width * height; + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_affine(x, y, matrix, &xf, &yf); + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], xf, yf, &dst); + dst_base[y * width + x] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); + + dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _custom_warp_affine_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create( + graph, I32, &type ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( custom_warp_affine, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c new file mode 100644 index 0000000..397f022 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_perspective") + + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_perspective_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def ) +#define SCALAR_INPUT_TYPE (2) +#define SCALAR_MATRIX_OFFSET (3) + +static void _transform_perspective + ( + vsi_size_t dst_x, + vsi_size_t dst_y, + const float m[], + float *src_x, + float *src_y + ) +{ + float z = dst_x * m[2] + dst_y * m[5] + m[8]; + + *src_x = (dst_x * m[0] + dst_y * m[3] + m[6]) / z; + *src_y = (dst_x * m[1] + dst_y * m[4] + m[7]) / z; +} + +static vsi_bool _read_pixel + ( + float *base, + vsi_nn_kernel_tensor_attr_t *attr, + float x, + float y, + float *pixel + ) +{ + vsi_size_t width = attr->shape->data[0]; + vsi_size_t height = attr->shape->data[1]; + vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height); + vsi_size_t bx = 0, by = 0; + + if (out_of_bounds) + { + *pixel = 205.0f; + return TRUE; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x; + by = y < 0 ? 0 : y >= height ? 
height - 1 : (vsi_size_t)y; + + *pixel = base[by * width + bx]; + + return TRUE; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + float* buffer[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; + int32_t type = 0; + float matrix[9] = {0}; + vsi_size_t i = 0; + vsi_size_t b = 0; + vsi_size_t x = 0; + vsi_size_t y = 0; + vsi_size_t out_elements = 0; + vsi_size_t width = 0; + vsi_size_t height = 0; + vsi_size_t outer_size = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + /* alloc the float32 data buffer */ + buffer[1] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + memset(buffer[1], 0, out_elements * sizeof(float)); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE], + &type); + CHECK_STATUS_FAIL_GOTO(status, final ); + for (i = 0; i < 9; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &matrix[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + width = attr[1]->shape->data[0]; + height = attr[1]->shape->data[1]; + for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i) + { + outer_size *= attr[1]->shape->data[i]; + } + // Do something + for (b = 0; b < outer_size; b++) + { + float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1]; + float *dst_base = buffer[1] + b * width * height; + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_perspective(x, y, matrix, &xf, &yf); + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], xf, yf, &dst); + dst_base[y * width + x] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); + + dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", 
_KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _custom_warp_perspective_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create( + graph, I32, &type ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( custom_warp_perspective, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c new file mode 100644 index 0000000..1698251 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c @@ -0,0 +1,295 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum _custom_warp_affine_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}custom_warp_affine_type_e; + +#define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine" + +// Add kernel hashtable here +#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20)) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ + _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } +#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ + _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _custom_warp_affine_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_KERNEL_MAP( U8, U8, bilinear ), + + PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_2D_KERNEL_MAP( U8, U8, bilinear ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_affine_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) +#define SCALAR_MATRIX_OFFSET (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * out_shape = NULL; + float m[6] = {0}; + float matrix0[4] = {0}; + float matrix1[4] = {0}; + float matrix4[4] = {0}; + int32_t i = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < 6; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &m[i]); + 
CHECK_STATUS_FAIL_GOTO(status, final ); + } + + matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3]; + matrix1[0] = m[4]; matrix1[1] = m[5]; + matrix4[0] = m[0]; matrix4[1] = m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2; + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix4", &matrix4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_affine_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _custom_warp_affine_kernel_map; + size_t kernel_map_size = _cnt_of_array( _custom_warp_affine_kernel_map ); + vx_param_description_t * param_def = _custom_warp_affine_kernel_param_def; + vx_kernel_initialize_f initializer = _custom_warp_affine_initializer; + int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( 
params, "matrix", &buffer_size ); + + if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, type ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + // Set default border mode. + border.constant_value.U32 = 0xcdcdcdcd; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_warp_affine, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c new file mode 100644 index 0000000..6936759 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _custom_warp_perspective_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}custom_warp_perspective_type_e; +#define _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE "custom_warp_perspective" + +// Add kernel hashtable here +#define CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20)) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ + _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE } +#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ + _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _custom_warp_perspective_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_KERNEL_MAP( U8, U8, bilinear ), + + PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_2D_KERNEL_MAP( U8, U8, bilinear ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_perspective_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def ) +#define SCALAR_MATRIX_OFFSET (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * out_shape = NULL; + float m[9] = {0}; + float matrix0[4] = {0}; + float matrix1[4] = {0}; + float matrix2[4] = {0}; + float matrix4[4] = {0}; + int32_t i = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < 9; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &m[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[3]; matrix0[3] = m[4]; + matrix1[0] = m[6]; matrix1[1] = m[7]; matrix1[2] = m[2]; matrix1[3] = m[5]; + matrix2[0] = m[8]; + matrix4[0] = m[0]; matrix4[1] = 
m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2; + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix2", &matrix2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix4", &matrix4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_perspective_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _custom_warp_perspective_kernel_map; + size_t kernel_map_size = _cnt_of_array( _custom_warp_perspective_kernel_map ); + vx_param_description_t * param_def = _custom_warp_perspective_kernel_param_def; + vx_kernel_initialize_f initializer = _custom_warp_perspective_initializer; + int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CUSTOM_WARP_PERSPECTIVE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); + + if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, 
&outputs[0]->attr.dtype) == FALSE) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, type ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + // Set default border mode. + border.constant_value.U32 = 0xcdcdcdcd; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_warp_perspective, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c new file mode 100644 index 0000000..2e7415e --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
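For reference on the matrix repacking in the perspective initializer above: the nine scalars form a 3x3 homography, and the grouping into matrix0 = (m0, m1, m3, m4), matrix1 = (m6, m7, m2, m5) and matrix2 = (m8) matches a per-pixel mapping of the form below. This is a hedged sketch; the exact convention is fixed by custom_warp_perspective.vx.

/* Sketch of the per-pixel mapping the perspective shader implements. */
float z     = x * m[2] + y * m[5] + m[8];
float x_src = (x * m[0] + y * m[3] + m[6]) / z;
float y_src = (x * m[1] + y * m[4] + m[7]) / z;

Source reads that land outside the input image fall back to the constant border the _setup() installs on the node (constant_value.U32 = 0xcdcdcdcd).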
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _ainr_denoise_postprocess_local_data_t { + int32_t placeholder; +} ainr_denoise_postprocess_local_data_t; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + +#if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT + self->n = vxDenoisePostProcesslayer( + self->graph->g, + REQUIRED_IO(inputs[0]), // currInput + REQUIRED_IO(inputs[1]), // nnOutput + REQUIRED_IO(inputs[2]), // preOutImg + REQUIRED_IO(inputs[3]), // S0 + REQUIRED_IO(inputs[4]), // C0 + REQUIRED_IO(inputs[5]), // C1 + REQUIRED_IO(inputs[6]), // C2 + REQUIRED_IO(inputs[7]), // C3 + REQUIRED_IO(inputs[8]), // clampMin + REQUIRED_IO(inputs[9]), // clampMax + REQUIRED_IO(outputs[0]) // output + ); +#else + self->n = NULL; +#endif + + if(NULL == self->n) + { + VSILOGE( "Create vxDenoisePostProcesslayer fail." ); + return VSI_FAILURE; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_AINR_DENOISE_POSTPROCESS, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 10, + /* output_num */ 1 + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c new file mode 100644 index 0000000..e076b7c --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _custom_warp_affine_local_data_t { + int32_t placeholder; +} custom_warp_affine_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_warp_affine_param * p; + p = &(self->nn_param.custom_warp_affine); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 ); + vsi_nn_kernel_param_add_int32( param, "type", p->type); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_warp_affine", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = self->nn_param.custom_warp_affine.size[0]; + outputs[0]->attr.size[1] = self->nn_param.custom_warp_affine.size[1]; + + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_WARP_AFFINE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c new file mode 100644 index 0000000..7afbd83 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following 
conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _custom_warp_perspective_local_data_t { + int32_t placeholder; +} custom_warp_perspective_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_warp_affine_param * p; + p = &(self->nn_param.custom_warp_affine); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 9 ); + vsi_nn_kernel_param_add_int32( param, "type", p->type); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_warp_perspective", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = self->nn_param.custom_warp_perspective.size[0]; + outputs[0]->attr.size[1] = self->nn_param.custom_warp_perspective.size[1]; + + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_WARP_PERSPECTIVE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index cc62fab..f40c56e 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -64,14 +64,16 @@ typedef struct static const _kernel_map_type _clip_kernel_map[] = { - PACK_KERNEL_MAP(F32, F32), - PACK_KERNEL_MAP(F32, U8), - PACK_KERNEL_MAP(U8, U8), - PACK_KERNEL_MAP(U8, F32), - PACK_KERNEL_MAP_2D(F32, F32), - PACK_KERNEL_MAP_2D(F32, U8), - PACK_KERNEL_MAP_2D(U8, U8), - PACK_KERNEL_MAP_2D(U8, F32), + PACK_KERNEL_MAP(F32, F32), + PACK_KERNEL_MAP(F32, U8), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP(U8, F32), + PACK_KERNEL_MAP(BF16, BF16), + PACK_KERNEL_MAP_2D(F32, F32), + PACK_KERNEL_MAP_2D(F32, U8), + PACK_KERNEL_MAP_2D(U8, U8), + PACK_KERNEL_MAP_2D(U8, F32), + PACK_KERNEL_MAP_2D(BF16, BF16), }; diff --git a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c new file mode 100644 index 0000000..e1bb5f9 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c @@ -0,0 +1,226 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
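The CL and EVIS backends touched in this patch (clip, depth2space_internal, eltwise_unary, floordiv, gather, topk, the custom warps) all select their kernel the same way: the input/output dtypes plus a variant flag are packed into a 32-bit key, and a static table is scanned for the entry whose function name and source file are then bound to the kernel. A self-contained toy of the scheme; the enum values and kernel names here are illustrative, not the library's real codes:

#include <stdint.h>
#include <stdio.h>

enum { U8 = 1, F16 = 2, F32 = 3, BF16 = 4 };  /* illustrative dtype codes */

#define HASH_KEY(in, out, flag) \
    ((uint32_t)(in) | ((uint32_t)(out) << 8) | ((uint32_t)(flag) << 16))
#define _cnt_of_array(a) (sizeof(a) / sizeof((a)[0]))

typedef struct { uint32_t key; const char * function_name; } kernel_map_entry;

static const kernel_map_entry kernel_map[] = {
    { HASH_KEY(F32,  F32,  0), "cl.clip_F32toF32"   },
    { HASH_KEY(BF16, BF16, 0), "cl.clip_BF16toBF16" },  /* variant added by this patch */
};

int main(void)
{
    uint32_t key = HASH_KEY(BF16, BF16, 0);
    size_t i = 0;
    for (i = 0; i < _cnt_of_array(kernel_map); i++)
    {
        if (kernel_map[i].key == key)
        {
            printf("selected %s\n", kernel_map[i].function_name);  /* -> cl.clip_BF16toBF16 */
            break;
        }
    }
    return 0;
}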
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define _DEPTH2SPACE_CRD_KERNEL_SOURCE "depth2space_crd" + +// Add kernel hashtable here +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F32TOF32 CVIVANTE_NAMESPACE("cl.depth2space_crd_F32toF32") + +// Add kernel hashtable here +#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \ + ((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8)) + +#define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } depth2space_crd_map[] = +{ + TENSOR_DEPTH2SPACE_CRD_KERNELS(F32, F32, _DEPTH2SPACE_CRD_KERNEL_SOURCE) +}; + +/* + * Kernel params + */ +static vx_param_description_t _depth2space_crd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t output_dims = 0; + int32_t output_width = 0; + int32_t output_height = 0; + int32_t output_chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + output_dims = (int32_t)attr[0]->shape->size; + output_width = (int32_t)(attr[0]->shape->data[0]); + output_height = (int32_t)(attr[0]->shape->data[1]); + output_chn = (int32_t)(output_dims > 2 ? 
attr[0]->shape->data[2] : 1); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = output_width; + gpu_param.global_size[1] = output_height; + gpu_param.global_size[2] = output_chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _depth2space_crd_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 ); + + for ( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ ) + { + if ( depth2space_crd_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(depth2space_crd_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name ); + kernel->info.parameters = _depth2space_crd_kernel_param_def; + kernel->info.numParams = _DEPTH2SPACE_CRD_PARAM_NUM; + kernel->info.initialize = _depth2space_crd_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + depth2space_crd_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + depth2space_crd_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DEPTH2SPACE_CRD_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _DEPTH2SPACE_CRD_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _DEPTH2SPACE_CRD_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( depth2space_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 5572007..ef10ea5 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -42,6 +42,7 @@ __BEGIN_DECLS typedef enum { UNARY_SIN, + UNARY_COS, UNARY_EXP, UNARY_LOG, UNARY_ELU, @@ -89,6 +90,7 @@ typedef enum VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, #define SIN_OPERATION sin +#define COS_OPERATION cos #define EXP_OPERATION exp #define LOG_OPERATION log #define ELU_OPERATION elu @@ -107,6 +109,8 @@ static const struct { { TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F32, F32) TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F16, F16) TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F32, F32) TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F16, F16) TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F32, F32) @@ -128,6 +132,8 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F32, F32) @@ -148,6 +154,7 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8) TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8) TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8) @@ -159,6 +166,7 @@ static const struct { TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8) TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8) @@ -175,6 +183,7 @@ static const struct { }; #undef SIN_OPERATION +#undef COS_OPERATION #undef EXP_OPERATION #undef LOG_OPERATION #undef ELU_OPERATION @@ -438,6 +447,7 @@ OnError: REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS ) REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP ) REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU ) diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index 1f0ba44..af31ed1 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -103,7 +103,6 @@ static vx_param_description_t _floordiv_kernel_param_def[] = #define SCALAR_OUTPUT_SCALE (7) #define SCALAR_OUTPUT_TAIL (8) -#define FLOORDIV_PARAM_NUM 3 #define FLOORDIV_QUANT_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def 
) /* @@ -154,8 +153,6 @@ final: return status; } /* _floordiv_initializer() */ - - /* * Query kernel */ @@ -164,8 +161,7 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - vsi_bool image_2d, - vsi_bool *is_use_u8_kernel + vsi_bool image_2d ) { vsi_status status = VSI_FAILURE; @@ -189,7 +185,7 @@ static vsi_status _query_kernel { in0_dtype = F32; } - else if (I16 == in0_dtype) + else if (I16 == in0_dtype || I8 == in0_dtype) { in0_dtype = I32; } @@ -198,7 +194,7 @@ static vsi_status _query_kernel { in1_dtype = F32; } - else if (I16 == in1_dtype) + else if (I16 == in1_dtype || I8 == in1_dtype) { in1_dtype = I32; } @@ -207,16 +203,9 @@ static vsi_status _query_kernel { out_dtype = F32; } - - if ((U8 == in0_dtype) || (U8 == in1_dtype) || (U8 == out_dtype)) + else if (I16 == out_dtype || I8 == out_dtype) { - param_def_size = FLOORDIV_QUANT_PARAM_NUM; - *is_use_u8_kernel = TRUE; - } - else - { - param_def_size = FLOORDIV_PARAM_NUM; - *is_use_u8_kernel = FALSE; + out_dtype = I32; } key = FLOORDIV_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); @@ -228,7 +217,7 @@ static vsi_status _query_kernel break; } } - if( i < kernel_map_size ) + if ( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; @@ -262,19 +251,18 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); - vsi_bool is_use_u8_kernel = FALSE; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); outputScale = 1.0f / outputScale; input0Tail = -(input0Tail * input0Scale); input1Tail = -(input1Tail * input1Scale); - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -282,40 +270,35 @@ static vsi_nn_kernel_node_t _setup image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, inputs, outputs, image_2d); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { - size_t node_params_num = FLOORDIV_PARAM_NUM; + size_t node_params_num = FLOORDIV_QUANT_PARAM_NUM; /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM, inputs, input_num, outputs, output_num ); - if (is_use_u8_kernel) - { - node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); - node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); - node_params[SCALAR_INPUT1_SCALE] = 
vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); - node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); - node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); - node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); - node_params_num = FLOORDIV_QUANT_PARAM_NUM; - } + + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); VSI_ASSERT( status == VSI_SUCCESS ); - if (is_use_u8_kernel) - { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); - } + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); } } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index aa5e2e5..fdeda2e 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -47,7 +47,8 @@ typedef enum INTERNAL_KERNEL_GATHER, } _internal_kernel_e; -#define _GATHER_KERNEL_SOURCE "gather" +#define _GATHER_KERNEL_SOURCE "gather" +#define _GATHER_BATCH_KERNEL_SOURCE "gather_batch" // Add kernel hashtable here #define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_U8toU8") @@ -55,25 +56,39 @@ typedef enum #define VX_KERNEL_NAME_GATHER_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_I32toI32") #define VX_KERNEL_NAME_GATHER_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_F32toF32") +#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_batch_U8toU8") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("cl.gather_batch_F16toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_batch_I32toI32") +#define VX_KERNEL_NAME_GATHER_BATCH_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_batch_F32toF32") + // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d, _batch) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d << 4) | (_batch)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { 
HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \ + VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; const char* source_name; } gather_map[] = { - TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE) + TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE) TENSOR_GATHER_KERNELS(F16, I32, F16, _GATHER_KERNEL_SOURCE) TENSOR_GATHER_KERNELS(I32, I32, I32, _GATHER_KERNEL_SOURCE) TENSOR_GATHER_KERNELS(F32, I32, F32, _GATHER_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, _GATHER_BATCH_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, _GATHER_BATCH_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(I32, I32, I32, _GATHER_BATCH_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(F32, I32, F32, _GATHER_BATCH_KERNEL_SOURCE) }; /* @@ -88,6 +103,7 @@ static vx_param_description_t _gather_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) @@ -97,6 +113,7 @@ static vsi_status cal_gather_tensor_reshape_size vsi_nn_tensor_t ** inputs, vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, + vsi_size_t batch_dims, uint32_t idxFlg ) { @@ -105,30 +122,37 @@ static vsi_status cal_gather_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; + vsi_size_t outerCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num - batch_dims; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (; i < dims_num; ++i) + { + outerCnt *= input_size[i]; + } + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } - if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) + if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = elementCnt; - sizes[1] = 1; + sizes[1] = outerCnt; status = VSI_SUCCESS; } else { - if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; + sizes[2] = outerCnt; status = VSI_SUCCESS; } } @@ -160,9 +184,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) vsi_size_array_t * input1_shape = NULL; int32_t block_size = 0; int32_t block_num = 0; - vsi_ssize_t indices_num = 1; - size_t input_dims1 = 0; - size_t i = 0; + vsi_ssize_t indices_num = 1; + size_t input_dims1 = 0; + size_t i = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -176,7 +200,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) input1_shape = attr[1]->shape; input_dims1 = input1_shape->size; - for (i = 0; i < input_dims1; i++) + for (i = 0; i < input_dims1 - 1; i++) { indices_num *= input1_shape->data[i]; } @@ -214,7 +238,8 @@ static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs + vsi_nn_tensor_t * const * const outputs, + int32_t is_batch /* Add extra 
params */ ) { @@ -227,17 +252,17 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 ); + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch ); - for( i = 0; i < _cnt_of_array(gather_map); i ++ ) + for ( i = 0; i < _cnt_of_array(gather_map); i ++ ) { - if( gather_map[i].key == key ) + if ( gather_map[i].key == key ) { break; } } - if( i < _cnt_of_array(gather_map) ) + if ( i < _cnt_of_array(gather_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name ); kernel->info.parameters = _gather_kernel_param_def; @@ -271,54 +296,69 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_GATHER_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" ); + int32_t is_batch = batch_dims > 0 ? 1 : 0; + vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3; + int32_t i = 0; - status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); - status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); - if(status != VSI_SUCCESS) + status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0); + status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1); + status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0); + if (status != VSI_SUCCESS) { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], 2 ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], rs_dim ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, inputs, outputs, is_batch ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { - uint32_t index = 0; -#define RESHAPE_DIM 2 + uint32_t index = 3; + int32_t batch = (int32_t)shapes[1][1]; + /* Pass parameters to node. 
*/ - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM ); - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM ); - node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM ); -#undef RESHAPE_DIM + vsi_nn_kernel_node_pack_io( node_params, _GATHER_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &indices_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &node_params[0] ); - vsi_nn_kernel_tensor_release( &node_params[1] ); - vsi_nn_kernel_tensor_release( &node_params[2] ); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); vsi_nn_kernel_scalar_release( &node_params[5] ); vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); } } + + for (i = 0; i < 3; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c index 49ccd23..95a4bff 100644 --- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -445,45 +444,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static int32_t _optimize_gn_shape_cl - ( - vsi_nn_tensor_t ** inputs, - vsi_size_t group_size, - int32_t group_num, - vsi_size_t* opt_shape, - int32_t* is2D_flg - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t new_rank = 0; - group_shape[0] = inputs[0]->attr.size[0]; - group_shape[1] = inputs[0]->attr.size[1]; - group_shape[2] = group_size; - - vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank ); - - if (opt_shape[1] == 1) - { - opt_shape[1] = group_num; - opt_shape[2] = 1; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - is2D_flg[0] = 1; - } - else if (new_rank == 2) - { - opt_shape[2] = group_num; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; - } - else - { - status = VSI_FAILURE; - } - - return status; -} - - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -535,11 +495,13 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg); + status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num, group_num, 0, new_shape); if ( VSI_SUCCESS != status ) { goto final; } + is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num); rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 929c812..58eb2ee 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -406,12 +406,12 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; - + uint32_t rank = outputs[0]->attr.dim_num; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); - size_t width = inputs[0]->attr.size[0]; size_t height = inputs[0]->attr.size[1]; + int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH + && rank > 2; int32_t group_num = (int32_t)(width + 15) / 16; int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); float input_scale = vsi_nn_get_tensor_scale(inputs[0]); diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index ed420ad..e5bae71 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -101,18 +101,23 @@ static const _kernel_map_type moments_map[] = TENSOR_MOMENTS_KERNELS(U8, F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(F32, F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(I32, F32, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(BF16,F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(U8, F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(F32, F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(I32, F32, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(BF16,F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(U8, F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(F32, F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I32, F32, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(BF16,F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,F32, 0, 1, 2, KERNEL_SOURCE_5) }; /* diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c new file mode 100644 index 0000000..ad99bc6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -0,0 +1,301 @@ +/**************************************************************************** +* +* Copyright (c) 2020 
Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _TOPK_KERNEL_SOURCE "topk" +#define STR(a) #a +// Add kernel hashtable here +#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) ) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \ + CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + _TOPK_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _topk_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, 0 ), + PACK_KERNEL_MAP( F32, F32, 1 ), + PACK_KERNEL_MAP( F32, F32, 2 ), + PACK_KERNEL_MAP( F32, F32, 3 ), + PACK_KERNEL_MAP( F32, F32, 4 ), + PACK_KERNEL_MAP( F32, F32, 5 ), + PACK_KERNEL_MAP( F32, F32, 6 ), + + PACK_KERNEL_MAP( U32, U32, 0 ), + PACK_KERNEL_MAP( U32, U32, 1 ), + PACK_KERNEL_MAP( U32, U32, 2 ), + PACK_KERNEL_MAP( U32, U32, 3 ), + PACK_KERNEL_MAP( U32, U32, 4 ), + PACK_KERNEL_MAP( U32, U32, 5 ), + PACK_KERNEL_MAP( U32, U32, 6 ), + + PACK_KERNEL_MAP( I32, I32, 0 ), + PACK_KERNEL_MAP( I32, I32, 1 ), + PACK_KERNEL_MAP( I32, I32, 2 ), + PACK_KERNEL_MAP( I32, I32, 3 ), + PACK_KERNEL_MAP( I32, I32, 4 ), + PACK_KERNEL_MAP( I32, I32, 5 ), + PACK_KERNEL_MAP( I32, I32, 6 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _topk_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) +#define SCALAR_INPUT_NUM_STAGES (3) +#define SCALAR_INPUT_WIDTH (4) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_topk_initializer) + ( + vsi_nn_kernel_node_t node, + const 
vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * in_shape = NULL; + int32_t num_stages = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_NUM_STAGES], &num_stages); + CHECK_STATUS_FAIL_GOTO(status, final ); + + in_shape = input_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = (size_t)(1 << num_stages); + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = (size_t)(1 << num_stages); + gpu_param.global_size[1] = in_shape->data[1]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); + return status; +} /* _topk_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t num_stages + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _topk_kernel_map; + size_t kernel_map_size = _cnt_of_array( _topk_kernel_map ); + vx_param_description_t * param_def = _topk_kernel_param_def; + vx_kernel_initialize_f initializer = _topk_initializer; +#define _PACK_SELECT_KEY( in_type, out_type ) \ + ( (in_type) | (out_type << 8) ) + uint32_t key = 0; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = TOPK_HASH_KEY( F32, F32, num_stages ); + break; + case _PACK_SELECT_KEY(U32, U32): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U8, U8): + key = TOPK_HASH_KEY( U32, U32, num_stages ); + break; + case _PACK_SELECT_KEY(I32, I32): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I8, I8): + key = TOPK_HASH_KEY( I32, I32, num_stages ); + break; + default: + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + 
vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t block_size = inputs[0]->attr.size[0]; + vsi_size_t block_num = 1; + uint32_t i = 0; + vsi_nn_tensor_t* rs_tensors[3] = { NULL }; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + int32_t width = (int32_t)block_size; + int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); + int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f)); + + for (i = 1; i < inputs[0]->attr.dim_num; i ++) + { + block_num = block_num * inputs[0]->attr.size[i]; + } + + if( vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE || + outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 ) + { + return NULL; + } + + shape[0][0] = block_size; + shape[0][1] = block_num; + shape[1][0] = top_k; + shape[1][1] = block_num; + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape[0], 2 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], 2 ); + rs_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[1], shape[1], 2 ); + + status = _query_kernel( kernel, inputs, outputs, num_stages ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM, + rs_tensors, input_num, &rs_tensors[1], output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create( + graph, I32, &num_stages ); + node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( + graph, I32, &width ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } +final: + vsi_safe_release_tensor(rs_tensors[0]); + vsi_safe_release_tensor(rs_tensors[1]); + vsi_safe_release_tensor(rs_tensors[2]); + if (node_params[SCALAR_INPUT_NUM_STAGES]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] ); + } + if (node_params[SCALAR_INPUT_WIDTH]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( topk, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index 3aa63e2..17b7be6 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -40,6 +40,7 @@ __BEGIN_DECLS typedef enum { UNARY_SIN, + UNARY_COS, UNARY_EXP, UNARY_LOG, UNARY_ELU, @@ -69,6 +70,11 @@ static float sin_eval(float data) return sinf(data); } +static float cos_eval(float data) +{ + return cosf(data); +} + static float log_eval(float data) { return logf(data); @@ -212,6 +218,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) case UNARY_SIN: data = sin_eval(data); break; + case UNARY_COS: + data = cos_eval(data); + break; case UNARY_EXP: data = exp_eval(data); break; @@ -372,6 +381,7 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( cos, UNARY_COS ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU ) diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c index 2ea12a5..b91dabd 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c +++ 
b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c @@ -42,7 +42,7 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define _CPU_ARG_NUM (3) +#define _CPU_ARG_NUM (4) #define _CPU_INPUT_NUM (2) #define _CPU_OUTPUT_NUM (1) #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) @@ -62,9 +62,9 @@ DEF_KERNEL_EXECUTOR(_gather_exec) uint32_t* buffer_idx = NULL; size_t in_elements = 0, out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t i = 0, j = 0; - int32_t block_size = 1, block_num = 1, axis_num = 0; - vsi_size_t indices_num = 1; + vsi_size_t i = 0, j = 0, b = 0; + int32_t block_size = 1, block_num = 1, axis_num = 0, batch_dims = 0; + vsi_size_t indices_num = 1, batch = 1, in_stride = 1, out_stride = 1; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; @@ -86,6 +86,8 @@ DEF_KERNEL_EXECUTOR(_gather_exec) CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis_num); CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &batch_dims); + CHECK_STATUS_FAIL_GOTO(status, final ); buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); @@ -98,26 +100,44 @@ DEF_KERNEL_EXECUTOR(_gather_exec) memset( buffer[1], 0, out_elements * sizeof(float) ); { - for(i = 0; i < attr[1]->shape->size; ++i) + for (i = 0; i < attr[1]->shape->size - (vsi_size_t)batch_dims; i++) { indices_num *= attr[1]->shape->data[i]; } - for(i = 0; i < (vsi_size_t)block_num; i++) + for (; i < attr[1]->shape->size; i++) { - for(j = 0; j < indices_num; j++) + batch *= attr[1]->shape->data[i]; + } + + for (i = 0; i < attr[0]->shape->size - (vsi_size_t)batch_dims; i++) + { + in_stride *= attr[0]->shape->data[i]; + } + + for (i = 0; i < attr[2]->shape->size - (vsi_size_t)batch_dims; i++) + { + out_stride *= attr[2]->shape->data[i]; + } + + for (b = 0; b < batch; b++) + { + for (i = 0; i < (vsi_size_t)block_num; i++) { - uint32_t indice = buffer_idx[j]; - vsi_size_t in_index = (i * axis_num + indice) * block_size; - if(in_index < in_elements) + for (j = 0; j < indices_num; j++) { - vsi_size_t out_index = (i * indices_num + j) * block_size; - memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); - } - else - { - status = VX_FAILURE; - CHECK_STATUS_FAIL_GOTO( status, final ); + uint32_t indice = buffer_idx[j + indices_num * b]; + vsi_size_t in_index = (i * axis_num + indice) * block_size + b * in_stride; + if (in_index < in_elements) + { + vsi_size_t out_index = (i * indices_num + j) * block_size + b * out_stride; + memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); + } + else + { + status = VX_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } } } } @@ -128,20 +148,20 @@ DEF_KERNEL_EXECUTOR(_gather_exec) CHECK_STATUS_FAIL_GOTO( status, final ); final: - if( buffer_idx ) + if ( buffer_idx ) { free( buffer_idx ); } - for( i = 0; i < 2; i ++ ) + for ( i = 0; i < 2; i ++ ) { - if( buffer[i] ) + if ( buffer[i] ) { free( buffer[i] ); } } - for( i = 0; i < _CPU_IO_NUM; i ++ ) + for ( i = 0; i < _CPU_IO_NUM; i ++ ) { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } } return status; } /* _gather_exec() */ @@ -156,6 +176,7 @@ static vx_param_description_t _gather_kernel_param_def[] = {VX_INPUT, 
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) @@ -201,15 +222,16 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, @@ -218,12 +240,14 @@ static vsi_nn_kernel_node_t _setup backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch_dims ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); CHECK_STATUS( status ); vsi_nn_kernel_scalar_release( &backend_params[3] ); vsi_nn_kernel_scalar_release( &backend_params[4] ); vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); } else { diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c index 33e8b33..e446623 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -103,9 +103,10 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) if(coord_stride <= 4) // reshape 3D { vsi_ssize_t stride[4] = {block_size, 0, 0, 0}; + int32_t start_dim = (int32_t)attr[0]->shape->size - coord_stride; for(i = 1; i < coord_stride; ++i) { - stride[i] = stride[i - 1] * attr[0]->shape->data[i]; + stride[i] = stride[i - 1] * attr[0]->shape->data[start_dim + i - 1]; } for(i = 0; i < indices_num; i++) @@ -118,8 +119,8 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) for(j = 0; j < coord_stride; j++) { coord[j] = buffer_idx[i * coord_stride + j]; + in_index += coord[j] * stride[j]; } - in_index = coord[3] * stride[3] + coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0]; memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); } } diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c index 2744643..cf9bb0e 100644 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -61,7 +61,13 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec) float * buffer[_CPU_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; + vsi_size_t batch = 1; + vsi_size_t depth = 1; + vsi_size_t norm_size = 1; + vsi_size_t b = 
0; + vsi_size_t c = 0; + vsi_size_t i = 0; + size_t rank = 1; float eps = .0f; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -96,62 +102,55 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec) CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); memset( buffer[3], 0, out_elements * sizeof(float) ); + rank = attr[0]->shape->size; + + batch = attr[0]->shape->data[rank - 1]; + depth = attr[0]->shape->data[rank - 2]; + + for ( i = 0; i < (vsi_size_t)rank - 2; i++) { - vsi_size_t b = 0, c = 0, h = 0, w = 0; - vsi_size_t height = attr[0]->shape->data[1]; - vsi_size_t width = attr[0]->shape->data[0]; - vsi_size_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - vsi_size_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; + norm_size *= attr[0]->shape->data[i]; + } - for (b = 0; b < bh; b++) + for (b = 0; b < batch; b++) + { + for (c = 0; c < depth; c++) { - for (c = 0; c < ch; c++) + vsi_size_t page = c * norm_size + b * norm_size * depth; + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + float data = 0; + float scaleVal = buffer[2][c]; + float biasVal = buffer[1][c]; + + for (i = 0; i < norm_size; i++) { - vsi_size_t page = c * (height * width) + b * (height * width * ch); - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - float data = 0; - float scaleVal = buffer[2][c]; - float biasVal = buffer[1][c]; + vsi_size_t index = page + i; + sum += buffer[0][index]; + } - for (h = 0; h < height; h++) - { - vsi_size_t len = page + h * width; + mean = sum / (float)norm_size; - for (w = 0; w < width; w++) - { - vsi_size_t index = len + w; - sum += buffer[0][index]; - } - } - mean = sum / (width * height); - for (h = 0; h < height; h++) - { - vsi_size_t len = page + h * width; - for (w = 0; w < width; w++) - { - vsi_size_t index = len + w; - data = buffer[0][index] - mean; - sumsq += data * data; - } - } - vari = sumsq / (width * height); - vari = (float)(1.0 / sqrtf(vari + eps)); - for (h = 0; h < height; h++) - { - vsi_size_t len = page + h * width; - for (w = 0; w < width; w++) - { - float normVal = 0; - vsi_size_t index = len + w; - data = buffer[0][index] - mean; + for (i = 0; i < norm_size; i++) + { + vsi_size_t index = page + i; + data = buffer[0][index] - mean; + sumsq += data * data; + } - normVal = data * vari * scaleVal + biasVal; - buffer[3][index] = normVal; - } - } + vari = sumsq / (float)norm_size; + vari = (float)(1.0 / sqrtf(vari + eps)); + + for (i = 0; i < norm_size; i++) + { + float normVal = 0; + vsi_size_t index = page + i; + data = buffer[0][index] - mean; + + normVal = data * vari * scaleVal + biasVal; + buffer[3][index] = normVal; } } } @@ -256,4 +255,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CPU( instance_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c index f133568..611bbfa 100644 --- a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c @@ -104,7 +104,6 @@ DEF_KERNEL_EXECUTOR(_compute) in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } for(i = 0; i < _OUTPUT_NUM; i ++) { @@ -311,4 +310,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CPU( resize_bilinear, _setup ) - diff --git 
a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c index a8cec94..3fe4185 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -63,6 +63,11 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_F16to"#OUT_DTYPE"_2D"), \ HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, +#define HASH_ARGMAX_KERNELS_MIX_OPT( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 2), \ + CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_opt"), \ + HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + static const struct { uint32_t key; char* function_name; @@ -132,6 +137,8 @@ static const struct { HASH_ARGMAX_KERNELS_2D(2, U8, I16) HASH_ARGMAX_KERNELS_2D(2, I16, U8) HASH_ARGMAX_KERNELS_2D(2, I16, I16) + HASH_ARGMAX_KERNELS_MIX_OPT(2, U8, I16) + HASH_ARGMAX_KERNELS_MIX_OPT(2, I8, I16) }; static vx_param_description_t kernel_param_def[] = @@ -228,7 +235,18 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) if (attr[0]->dtype == I8 || attr[0]->dtype == U8) { - if ( attr[1]->dtype == I8 || + if (axis == 2 && + input_shape->data[2] > 1 && + ((attr[1]->dtype == I8 || attr[1]->dtype == U8) + || (attr[1]->dtype == I16 && input_shape->data[2] < 256))) + { + uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16) + | ((argLenSub1 & 0xFF) << 8) | (argLenSub1 & 0xFF); + packedArgIdx[0] = packedArgIdx[1] = pack; + packedArgIdx[2] = packedArgIdx[3] = pack; + gpu_param.global_scale[0] = 16; + } + else if ( attr[1]->dtype == I8 || attr[1]->dtype == U8) { uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16) @@ -302,7 +320,6 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) } break; case 1: - case 2: { gpu_dp_inst_t uniExtractData_2x8 = {{ 0x11111111, // TCfg @@ -324,6 +341,52 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case 2: + { + gpu_dp_inst_t uniExtractData_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtract1stU8toI16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtract2ndU8toI16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractData_2x8", &uniExtractData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract1stU8toI16_2x8", &uniExtract1stU8toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract2ndU8toI16_2x8", &uniExtract2ndU8toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "argLenSub1", &argLenSub1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "packedArgIdx", packedArgIdx ); + 
CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } @@ -354,6 +417,16 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ((input_dtype == I8 || input_dtype == U8) + && output_dtype == I16 + && axis == 2 + && inputs[0]->attr.size[2] < 256 + && image_2d == 0) + { + image_2d = 2; + } + key = HASH_ARGMAX_HASH_KEY( axis, input_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(_argmax_evis_kernel_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index ee5a622..dbbe2ad 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -85,12 +85,12 @@ typedef enum #define COMPARISONS_KERNELS_HALF(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ - HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, F16, F16), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, BF16, BF16), \ SOURCE }, #define COMPARISONS_KERNELS_HALF_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ - HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, F16, F16), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, BF16, BF16), \ SOURCE }, #define LESS_OP less @@ -396,6 +396,26 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; status = vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); @@ -403,6 +423,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "input0Scale", &input0Scale ); status |= vsi_nn_kernel_gpu_add_param( node, @@ -453,7 +477,7 @@ static vsi_status _query_kernel int i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = output_dtype == I8 ? 
BOOL8 : output_dtype; key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index 732f949..de5aa83 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -301,6 +301,7 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) case _PACK_SELECT_KEY( I8, I8): case _PACK_SELECT_KEY( I16, I16): case _PACK_SELECT_KEY( F16, F16): + case _PACK_SELECT_KEY( BF16, BF16): { gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); multAndoutZP0[0] = (uint32_t)(M0); @@ -367,6 +368,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input0_dtype == BF16) + { + input0_dtype = F16; + } + + if (output_dtype == BF16) + { + output_dtype = F16; + } + key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg ); for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 1b99cb1..1e15e71 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -42,6 +42,7 @@ __BEGIN_DECLS typedef enum { UNARY_SIN, + UNARY_COS, UNARY_EXP, UNARY_LOG, UNARY_ELU, @@ -79,6 +80,7 @@ typedef enum SOURCE }, #define SIN_OPERATION sin +#define COS_OPERATION cos #define EXP_OPERATION exp #define LOG_OPERATION log #define ELU_OPERATION elu @@ -106,6 +108,17 @@ static const struct { TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_3D) @@ -162,6 +175,17 @@ static const struct { TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16 , 
KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_2D) @@ -317,6 +341,7 @@ static const struct { }; #undef SIN_OPERATION +#undef COS_OPERATION #undef EXP_OPERATION #undef LOG_OPERATION #undef ELU_OPERATION @@ -443,6 +468,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) switch( pack_key ) { case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_ELU, BF16, BF16 ): @@ -736,6 +762,7 @@ OnError: REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU ) diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index e5b12f7..3be3996 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -64,6 +64,28 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0") #define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8") +#define VX_KERNEL_NAME_GATHER_BATCH_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8") +#define VX_KERNEL_NAME_GATHER_BATCH_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16") +#define VX_KERNEL_NAME_GATHER_BATCH_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8") + +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16_axis0") +#define 
VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8_axis0") + #define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array") #define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array") #define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array") @@ -77,31 +99,43 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "gather" #define KERNEL_SOURCE_2 "gather_mix" #define KERNEL_SOURCE_3 "gather_array" +#define KERNEL_SOURCE_4 "gather_batch" +#define KERNEL_SOURCE_5 "gather_mix_batch" // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max)) +#define HASH_GATHER_KEY(_in0_type, _in1_type, _out_type, _axis0, _max, _batch) \ + ((_in0_type << 24) | (_in1_type << 16) | (_out_type << 8) | (_axis0 << 6) | (_max << 4) | (_batch)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 0), \ VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1, 0), \ VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1, 0), \ VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 1), \ + VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_GATHER_BATCH_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 1), \ + VX_KERNEL_NAME_GATHER_BATCH_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -136,6 +170,26 @@ static const struct { TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3) TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3) TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3) + TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_KERNELS(I8, I32, I8, KERNEL_SOURCE_4) + 
TENSOR_GATHER_BATCH_KERNELS(I16, I32, I16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_KERNELS(I8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(I16, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, I8, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, I16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(U8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, U8, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_5) }; /* @@ -158,6 +212,7 @@ static vsi_status get_gather_tensor_reshape_size vsi_nn_tensor_t ** inputs, vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], vsi_size_t block_size, + vsi_size_t batch_dims, uint32_t idxFlg, int32_t* arrayFlg ) @@ -167,13 +222,19 @@ static vsi_status get_gather_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; + vsi_size_t outerCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) - for(i = 0; i < dims_num; ++i) + for(i = 0; i < dims_num - batch_dims; ++i) { elementCnt *= input_size[i]; } + for(; i < dims_num; ++i) + { + outerCnt *= input_size[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; @@ -182,13 +243,14 @@ static vsi_status get_gather_tensor_reshape_size if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = elementCnt; - sizes[1] = 1; + sizes[1] = outerCnt; status = VSI_SUCCESS; } else { sizes[0] = block_size; sizes[1] = elementCnt / block_size; + sizes[2] = outerCnt; if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH) { arrayFlg[0] = 1; @@ -222,6 +284,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) int32_t block_num = 0; int32_t indices_num = 1; uint32_t input_dims1 = 0; + int32_t batch = 1; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input1_shape = NULL; @@ -283,7 +346,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; - for (i = 0; i < input_dims1; i++) + for (i = 0; i < input_dims1 - 1; i++) { indices_num *= (int32_t)(input1_shape->data[i]); } @@ -376,6 +439,11 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) #undef _PACK_SELECT_KEY status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num); + if (attr[2]->shape->size > 2) + { + batch = (int32_t)attr[2]->shape->data[2]; + status = vsi_nn_kernel_gpu_add_param(node, "batch", &batch); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -415,6 +483,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) int32_t block_num = 0; int32_t indices_num = 1; + int32_t batch = 1; uint32_t input_dims1 = 0; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; @@ -475,10 +544,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; - for (i = 0; i < input_dims1; i++) + for (i = 0; i < 
input_dims1 - 1; i++) { indices_num *= (int32_t)(input1_shape->data[i]); } + batch = (int32_t)(input1_shape->data[input_dims1 - 1]); shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -486,7 +556,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0], 4); shaderParam.global_size[1] = block_num; - shaderParam.global_size[2] = 1; + shaderParam.global_size[2] = batch; status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -585,6 +655,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) #undef _PACK_SELECT_KEY status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num); + if (attr[2]->shape->size > 2) + { + status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -617,7 +691,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel, const vsi_nn_kernel_param_t * params, int32_t axis, - int32_t is_array + int32_t is_array, + int32_t is_batch ) { vsi_status status = VSI_FAILURE; @@ -638,7 +713,7 @@ static vsi_status _query_kernel output_dtype = F16; } - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array); + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array, is_batch); for( i = 0; i < _cnt_of_array(gather_map); i ++ ) { @@ -688,25 +763,30 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t axis0_flg = 0; int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0; + int32_t is_batch = batch_dims > 0 ? 1 : 0; + vsi_size_t rs_dim = batch_dims == 0 ? 
2 : 3; + int32_t i = 0; if (axis == 0) { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], batch_dims, 0, &is_array); axis0_flg = 1; } else { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array); axis0_flg = 0; } #undef VSI_NN_MAX_BLOCK_SIZE @@ -715,38 +795,45 @@ static vsi_nn_kernel_node_t _setup return NULL; } + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], 2 ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], rs_dim ); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array); + status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { - uint32_t index = 0; -#define RESHAPE_DIM 2 + uint32_t index = 3; + /* Pass parameters to node. 
*/ - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM ); -#undef RESHAPE_DIM + vsi_nn_kernel_node_pack_io( tmp_params, _GATHER_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &tmp_params[0] ); - vsi_nn_kernel_tensor_release( &tmp_params[1] ); - vsi_nn_kernel_tensor_release( &tmp_params[2] ); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); vsi_nn_kernel_scalar_release( &tmp_params[5] ); } } + + for (i = 0; i < 3; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index 2894f11..9693c29 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -994,44 +993,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static int32_t _optimize_gn_shape - ( - vsi_nn_tensor_t ** inputs, - vsi_size_t group_size, - int32_t group_num, - vsi_size_t* opt_shape, - int32_t* is2D_flg - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t new_rank = 0; - group_shape[0] = inputs[0]->attr.size[0]; - group_shape[1] = inputs[0]->attr.size[1]; - group_shape[2] = group_size; - - vsi_nn_kernel_optimize_element_shape( group_shape, 3, opt_shape, &new_rank ); - - if (opt_shape[1] == 1) - { - opt_shape[1] = group_num; - opt_shape[2] = 1; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - is2D_flg[0] = 1; - } - else if (new_rank == 2) - { - opt_shape[2] = group_num; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; - } - else - { - status = VSI_FAILURE; - } - - return status; -} - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1077,11 +1038,13 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _optimize_gn_shape(inputs, group_size, group_num, new_shape, &is2D_flg); + status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num, group_num, 0, new_shape); if ( VSI_SUCCESS != status ) { goto final; } + is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num); rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index 9ddc0bf..4f3367e 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -1004,12 +1004,15 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; + uint32_t rank = outputs[0]->attr.dim_num; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); + int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH + && rank > 2; // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( - outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) || + rank > 4 ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c index ed9561c..6a323c0 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c @@ -76,9 +76,15 @@ static const _kernel_map_type _logical_ops_kernel_map[] = PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, I8, I8, "or"), PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, I8, I8, "and"), PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, BF16, I8, "or"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, BF16, I8, "and"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"), PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, I8, I8, "or"), PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, I8, I8, "and"), PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, BF16, I8, "or"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, BF16, I8, "and"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"), }; @@ -159,6 +165,22 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniMulShortMinus1toFp16_2x8", &uniMulShortMinus1toFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, final ); } + else if (BF16 == input_dtype) + { + gpu_dp_inst_t uniConvertInt16toInt8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertInt16toInt8_2x8", &uniConvertInt16toInt8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); 
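    /* The BF16 branch above registers only uniConvertInt16toInt8_2x8: the logical
     * result is produced in 16-bit lanes and, presumably, narrowed here to the
     * I8/BOOL8 output layout (inferred from the DP configuration and the
     * BF16-to-I8 entries added to _logical_ops_kernel_map; not stated in the patch). */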
CHECK_STATUS_FAIL_GOTO(status, final ); @@ -209,9 +231,13 @@ static vsi_status _query_kernel return VSI_FAILURE; } - if (BOOL8 == in_dtype && BOOL8 == out_dtype) + if (BOOL8 == in_dtype) { in_dtype = I8; + } + + if (BOOL8 == out_dtype) + { out_dtype = I8; } diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index f368c97..c03e942 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -56,6 +56,7 @@ __BEGIN_DECLS #define KERNEL_SOURCE_12 "matrixmul_u8u8_f16" #define KERNEL_SOURCE_13 "matrixmul_i16" #define KERNEL_SOURCE_14 "matrixmul_f16i16_i16" +#define KERNEL_SOURCE_15 "matrixmul_bf16" #define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b)) @@ -110,6 +111,7 @@ static const struct { TENSOR_MATRIX_MUL_KERNELS(I8, F16, F16, KERNEL_SOURCE_8) TENSOR_MATRIX_MUL_KERNELS(I16, F16, F16, KERNEL_SOURCE_8) TENSOR_MATRIX_MUL_KERNELS(F16, F16, F16, KERNEL_SOURCE_2) + TENSOR_MATRIX_MUL_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_KERNELS(F16, F16, U8, KERNEL_SOURCE_11) TENSOR_MATRIX_MUL_KERNELS(F16, F16, I8, KERNEL_SOURCE_11) TENSOR_MATRIX_MUL_KERNELS(F16, F16, I16, KERNEL_SOURCE_11) @@ -119,6 +121,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, I16, I16, KERNEL_SOURCE_7) @@ -126,6 +129,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, F16, I8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, F16, I16, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) }; /* @@ -587,6 +591,36 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) 0x00000600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; float 
scaleIn0divOut = src0Scale / dstScale; float scaleIn1divOut = src1Scale / dstScale; @@ -936,6 +970,22 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 0 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 1 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 0 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 1 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 0 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ): { status = vsi_nn_kernel_gpu_add_param( node, diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 3c76c65..5bade3b 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -64,6 +64,10 @@ __BEGIN_DECLS #define KERNEL_NAME_MAXIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8_2D") #define KERNEL_NAME_MAXIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16") #define KERNEL_NAME_MAXIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16_2D") +#define KERNEL_NAME_MAXIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8") +#define KERNEL_NAME_MAXIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8_2D") +#define KERNEL_NAME_MAXIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16") +#define KERNEL_NAME_MAXIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16_2D") #define KERNEL_SOURCE_1 "maximum", #define KERNEL_SOURCE_2 "maximum_fp16", @@ -109,6 +113,7 @@ static const struct { TENSOR_MAX_KERNELS(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(F16, F16, U8, KERNEL_SOURCE_2) @@ -120,12 +125,14 @@ static const struct { TENSOR_MAX_KERNELS(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS(I16, I16, U8, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2) @@ -137,6 +144,7 @@ static const struct { TENSOR_MAX_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3) }; static vx_param_description_t kernel_param_def[] = @@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in0_fl = (uint8_t)attr[0]->dfp.fl; + if (in0_fl > 0) + { + src0Scale = 1.0f / (float) ((int64_t)1 << 
in0_fl); + } + else + { + src0Scale = (float)((int64_t)1 << -in0_fl); + } } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in1_fl = (uint8_t)attr[1]->dfp.fl; + if (in1_fl > 0) + { + src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl); + } + else + { + src1Scale = (float)((int64_t)1 << -in1_fl); + } } else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) attr[1]->dtype, attr[2]->dtype ); if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) - || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) + || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) + || (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) ) { gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) case _PACK_SELECT_KEY( U8, U8, U8 ): case _PACK_SELECT_KEY( U8, F16, U8 ): case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( U8, U8, I16 ): + case _PACK_SELECT_KEY( I16, I16, U8 ): { uint16_t M0 = 0; uint16_t M1 = 0; @@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); CHECK_STATUS_FAIL_GOTO(status, final ); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I16) { status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } CHECK_STATUS_FAIL_GOTO(status, final ); } } @@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, tmp_inputs, 2, outputs, 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); - } } return node; @@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( maximum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 16be973..9a64243 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -64,6 +64,10 @@ __BEGIN_DECLS #define KERNEL_NAME_MINIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8_2D") #define KERNEL_NAME_MINIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16") #define 
KERNEL_NAME_MINIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16_2D") +#define KERNEL_NAME_MINIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8") +#define KERNEL_NAME_MINIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8_2D") +#define KERNEL_NAME_MINIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16") +#define KERNEL_NAME_MINIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16_2D") #define KERNEL_SOURCE_1 "minimum", #define KERNEL_SOURCE_2 "minimum_fp16", @@ -109,6 +113,7 @@ static const struct { TENSOR_MIN_KERNELS(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(F16, F16, U8, KERNEL_SOURCE_2) @@ -120,12 +125,14 @@ static const struct { TENSOR_MIN_KERNELS(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS(I16, I16, U8, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2) @@ -137,6 +144,7 @@ static const struct { TENSOR_MIN_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3) }; static vx_param_description_t kernel_param_def[] = @@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in0_fl = (uint8_t)attr[0]->dfp.fl; + if (in0_fl > 0) + { + src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl); + } + else + { + src0Scale = (float)((int64_t)1 << -in0_fl); + } } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in1_fl = (uint8_t)attr[1]->dfp.fl; + if (in1_fl > 0) + { + src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl); + } + else + { + src1Scale = (float)((int64_t)1 << -in1_fl); + } } else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) attr[1]->dtype, attr[2]->dtype ); if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) - || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) + || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) + || (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) ) { gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) case _PACK_SELECT_KEY( U8, U8, U8 ): case _PACK_SELECT_KEY( U8, F16, U8 ): case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( U8, U8, I16 ): + case _PACK_SELECT_KEY( I16, I16, U8 ): { uint16_t M0 = 0; uint16_t M1 = 0; @@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) status = 
vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); CHECK_STATUS_FAIL_GOTO(status, final ); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I16) { status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } CHECK_STATUS_FAIL_GOTO(status, final ); } } @@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, tmp_inputs, 2, outputs, 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); - } } return node; @@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( minimum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index cf540bc..bc45fc0 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -101,14 +101,17 @@ static const struct { TENSOR_MOMENTS_KERNELS(I8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(I16, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(BF16,BF16,0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(U8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(I8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(I16, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(BF16,BF16,1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(U8, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(BF16,BF16,2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(U8, U8, 0, KERNEL_SOURCE_6) TENSOR_MOMENTS_KERNELS(U8, U8, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_KERNELS(U8, U8, 2, KERNEL_SOURCE_6) @@ -116,26 +119,31 @@ static const struct { TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,BF16,0, 1, KERNEL_SOURCE_7) TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, U8, 0, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,BF16,0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, U8, 0, 1, 
2, KERNEL_SOURCE_7) TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(U8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(U8, U8, 0, KERNEL_SOURCE_6) TENSOR_MOMENTS_KERNELS_2D(U8, U8, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(BF16,BF16,0, 1, KERNEL_SOURCE_7) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, U8, 0, 1, KERNEL_SOURCE_6) }; @@ -461,6 +469,36 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; switch( pack_key ) { @@ -494,6 +532,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 1, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", + &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, F16, 1, 1): case _PACK_SELECT_KEY( I8, F16, 1, 1): case _PACK_SELECT_KEY( I16, F16, 1, 1): @@ -518,6 +568,16 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 1, 1): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", + &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= 
vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, F16, 1, 2): case _PACK_SELECT_KEY( I8, F16, 1, 2): case _PACK_SELECT_KEY( I16, F16, 1, 2): @@ -542,6 +602,15 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 1, 2): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, F16, 2, 0): case _PACK_SELECT_KEY( I8, F16, 2, 0): case _PACK_SELECT_KEY( I16, F16, 2, 0): @@ -597,6 +666,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 2, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( F16, F16, 3, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); @@ -608,6 +689,19 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 3, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, U8, 1, 0): case _PACK_SELECT_KEY( U8, U8, 1, 1): case _PACK_SELECT_KEY( U8, U8, 1, 2): diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c index 60de16a..bc78fd3 100644 --- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -68,27 +68,29 @@ typedef struct static const _kernel_map_type _one_hot_kernel_map[] = { // Register kernel here - PACK_ONE_HOT_KERNEL_3D( U8, U8 ), - PACK_ONE_HOT_KERNEL_3D( U8, F16 ), - PACK_ONE_HOT_KERNEL_3D( I8, I8 ), - PACK_ONE_HOT_KERNEL_3D( I8, F16 ), - PACK_ONE_HOT_KERNEL_3D( I16, I16 ), - PACK_ONE_HOT_KERNEL_3D( I16, F16 ), - PACK_ONE_HOT_KERNEL_3D( F16, F16 ), - PACK_ONE_HOT_KERNEL_3D( F16, I16 ), - PACK_ONE_HOT_KERNEL_3D( F16, U8 ), - PACK_ONE_HOT_KERNEL_3D( F16, I8 ), + PACK_ONE_HOT_KERNEL_3D( U8, U8 ), + PACK_ONE_HOT_KERNEL_3D( U8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I8, I8 ), + PACK_ONE_HOT_KERNEL_3D( I8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I16, I16 ), + PACK_ONE_HOT_KERNEL_3D( I16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, I16 ), + PACK_ONE_HOT_KERNEL_3D( F16, U8 ), 
+ PACK_ONE_HOT_KERNEL_3D( F16, I8 ), + PACK_ONE_HOT_KERNEL_3D( BF16, BF16 ), - PACK_ONE_HOT_KERNEL_2D( U8, U8 ), - PACK_ONE_HOT_KERNEL_2D( U8, F16 ), - PACK_ONE_HOT_KERNEL_2D( I8, I8 ), - PACK_ONE_HOT_KERNEL_2D( I8, F16 ), - PACK_ONE_HOT_KERNEL_2D( I16, I16 ), - PACK_ONE_HOT_KERNEL_2D( I16, F16 ), - PACK_ONE_HOT_KERNEL_2D( F16, F16 ), - PACK_ONE_HOT_KERNEL_2D( F16, I16 ), - PACK_ONE_HOT_KERNEL_2D( F16, U8 ), - PACK_ONE_HOT_KERNEL_2D( F16, I8 ), + PACK_ONE_HOT_KERNEL_2D( U8, U8 ), + PACK_ONE_HOT_KERNEL_2D( U8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I8, I8 ), + PACK_ONE_HOT_KERNEL_2D( I8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I16, I16 ), + PACK_ONE_HOT_KERNEL_2D( I16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, I16 ), + PACK_ONE_HOT_KERNEL_2D( F16, U8 ), + PACK_ONE_HOT_KERNEL_2D( F16, I8 ), + PACK_ONE_HOT_KERNEL_2D( BF16, BF16 ), }; @@ -274,6 +276,51 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) "depth", &depth ); CHECK_STATUS_FAIL_GOTO(status, final ); } + break; + case BF16: + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index 2bed1e4..c007a08 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -98,7 +98,6 @@ static const struct { PRELU_KERNELS_2D(I8, F16, F16, _2D, KERNEL_SOURCE0) PRELU_KERNELS_2D(U8, U8, U8, _2D, KERNEL_SOURCE0) PRELU_KERNELS_2D(U8, U8, F16, _2D, KERNEL_SOURCE0) - }; static vx_param_description_t kernel_param_def[] = @@ -199,6 +198,7 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) } else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { + out_fl = 1; outputZP = (float)attr[2]->asymm.zero_point; input_scale0 = input_scale0 / attr[2]->asymm.scale; } @@ -628,7 +628,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, reshape_tensors, 2, &reshape_tensors[2], 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, 
_EVIS_PARAM_NUM ); - } } @@ -643,4 +642,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( prelu, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index a4e4fa9..e3b5582 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -51,11 +51,13 @@ typedef enum UP_2X_HALF, UP_3X_HALF, UP_4X_HALF, + UP_8X_HALF, } _internal_scale_e; #define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" -#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2" #define STR(a) #a // Add kernel hashtable here @@ -81,19 +83,25 @@ typedef enum { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_2x_upsample_half_pixel_centers"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } #define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_4x_upsample_half_pixel_centers"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_8x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) } #define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_3x_upsample_half_pixel_centers"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } typedef struct { @@ -120,6 +128,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_8X_HALF(U8, U8), }; @@ -224,6 +233,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) vsi_bool is_2x_up_kernel = FALSE; vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; + vsi_bool is_8x_up_kernel = FALSE; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -280,6 +290,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height); } if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) @@ -330,7 +341,7 @@ 
DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) outputZP = 0; } - if (is_2x_up_kernel || is_4x_up_kernel) + if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) { gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; @@ -479,6 +490,76 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); CHECK_STATUS_FAIL_GOTO(status, final ); } + else if (is_8x_up_kernel) + { + gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", 
&uniResize8xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { float dfpScale = input_scale * output_scale; @@ -965,25 +1046,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) goto final; } - if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel) + if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel&& !is_8x_up_kernel) { status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); CHECK_STATUS_FAIL_GOTO(status, final ); } - if (is_2x_up_kernel || is_4x_up_kernel) + if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) { - gpu_param.global_size[0] = gpu_align_p2((out_width + \ - gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = depth; - gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = depth; + gpu_param.dim = 2; } else { - gpu_param.global_size[0] = gpu_align_p2((out_width + \ - gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; - gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -1024,6 +1105,8 @@ static vsi_status _query_kernel && (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); _internal_scale_e scale_flag = UP; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -1032,6 +1115,7 @@ static vsi_status _query_kernel is_2x_upsample &= (in_dtype == U8); is_3x_upsample &= (in_dtype == U8); is_4x_upsample &= (in_dtype == U8); + is_8x_upsample &= (in_dtype == U8); if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) { @@ -1047,6 +1131,10 @@ static vsi_status _query_kernel { scale_flag = UP_4X_HALF; } + else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample) + { + scale_flag = UP_8X_HALF; + } else if (is_same_type && is_evis2) { scale_flag = UP_OPT; @@ -1123,7 +1211,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ static vsi_nn_tensor_t* _create_scale_tensor @@ -1307,4 +1394,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( resize_bilinear, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 
fe8a9d7..778d1fe 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -74,6 +74,7 @@ static const struct { TENSOR_SCATTER_ND_KERNELS(I32, U8, U8, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_KERNELS(I32, I16, I16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_KERNELS(I32, F16, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, BF16,BF16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_BIG_KERNELS(I32, I8, I8, KERNEL_SOURCE_2) TENSOR_SCATTER_ND_BIG_KERNELS(I32, U8, U8, KERNEL_SOURCE_2) TENSOR_SCATTER_ND_BIG_KERNELS(I32, I16, I16, KERNEL_SOURCE_2) @@ -250,8 +251,45 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); status |= vsi_nn_kernel_gpu_add_param( node, "zeropoint", &output_zp ); status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 02526f5..91ea9cb 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -67,6 +67,13 @@ static vsi_status _gpu_register vsi_nn_kernel_t* kernel ); +static vsi_status _gpu_register_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ); + static vx_program _create_program_from_executable ( vsi_nn_graph_t* graph, @@ -79,6 +86,13 @@ static vx_program _create_program_from_code vsi_nn_kernel_t* kernel ); +static vx_program _create_program_from_code_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ); + static const uint8_t* _load_internal_executable ( const char* source_name, @@ -104,6 +118,14 @@ static void _kernel_clear_source static vsi_bool _check_shader_support(vsi_nn_graph_t* graph); +static vsi_bool vsi_nn_kernel_is_asymmtric_int8 + ( + vsi_nn_tensor_t** inputs, + size_t input_num, + vsi_nn_tensor_t** 
outputs, + size_t output_num + ); + static vsi_status VX_CALLBACK _kernel_validator ( vx_node node, @@ -290,7 +312,7 @@ static char* _load_source_code_from_file size_t read_bytes; source = NULL; //TODO: Pack new name - fp = fopen( source_name, "rb" ); + fp = vsi_nn_fopen( source_name, "rb" ); if( NULL == fp ) { VSILOGE("Open program file %s fail.", source_name); @@ -414,6 +436,58 @@ static vx_program _create_program_from_code return program; } /* _create_program_from_code() */ +static vx_program _create_program_from_code_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ) +{ + const vsi_nn_kernel_source_info_t* source_info; + kernel_program_info_t* program_info; + size_t i; + vx_program program = NULL; + source_info = &kernel->gpu.sources[VSI_NN_GPU_SOURCE_FMT_CODE]; + + if( source_info->num == 0 ) + { + VSILOGE("Not executable source found in kernel."); + return NULL; + } + program_info = (kernel_program_info_t*)malloc( + source_info->num * sizeof(kernel_program_info_t) ); + if( !program_info ) + { + VSILOGE("Malloc program memory fail."); + return NULL; + } + memset( program_info, 0, source_info->num * sizeof(kernel_program_info_t) ); + + for( i = 0; i < source_info->num; i ++ ) + { + program_info[i].data = (const void*)(resources[i]); + if( !program_info[i].data ) + { + program_info[i].reserve_mem = (void*)_load_source_code_from_file( + source_info->data[i], &program_info[i].size ); + program_info[i].data = (const void*)program_info[i].reserve_mem; + } + } + program = _create_program( graph->ctx->c, program_info, source_info->num ); + if( program_info ) + { + for( i = 0; i < source_info->num; i ++ ) + { + if( program_info[i].reserve_mem ) + { + free( program_info[i].reserve_mem ); + } + } + free( program_info ); + } + return program; +} /* _create_program_from_code_ext() */ + static vx_program _create_program_from_executable ( vsi_nn_graph_t* graph, @@ -547,6 +621,113 @@ static vsi_status _gpu_register return status; } /* _gpu_register() */ +static vsi_status _gpu_register_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ) +{ + vsi_status status; + vx_kernel_description_t* info; + vx_kernel obj; + vsi_nn_context_t context; + vx_program program = NULL; + const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; + +#define MAX_BUILDPROGRAM_LEN 1024 + char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; + size_t cost_bytes = 0; + + memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); + context = graph->ctx; + + status = VSI_FAILURE; + info = &(kernel->info); + + switch( active_fmt ) + { + case VSI_NN_GPU_SOURCE_FMT_CODE: + program = _create_program_from_code_ext( graph, kernel,resources ); + break; + case VSI_NN_GPU_SOURCE_FMT_EXECUTABLE: + program = _create_program_from_executable( graph, kernel ); + break; + default: + VSILOGE("Unknown source format %d", kernel->gpu.active_source_fmt); + break; + } + if( NULL == program ) + { + return status; + } + + if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + { + // set default evis version is 2 + if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) + { + cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", + context->config.use_40bits_va ); + } + } + else + { + cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", + context->config.evis.ver, context->config.use_40bits_va ); + } + // Pack build option + if( 
kernel->gpu.sources[active_fmt].build_option.data ) + { + vsi_nn_kernel_build_option_t * option = &kernel->gpu.sources[active_fmt].build_option; + if( MAX_BUILDPROGRAM_LEN - cost_bytes > strlen( option->data ) + 1 ) + { + snprintf( &cmd[cost_bytes], MAX_BUILDPROGRAM_LEN - cost_bytes, + " %s", option->data ); + } + else + { + VSILOGE("Build option is too long!"); + VSI_ASSERT( FALSE ); + } + } + + status = vxBuildProgram( program, cmd ); + + if( VSI_SUCCESS != status ) + { + VSILOGE("Build program fail."); + return status; + } + + obj = vxAddKernelInProgram( + program, + info->name, + info->enumeration, + info->numParams, + info->validate, + info->initialize, + info->deinitialize + ); + + if( obj ) + { + status = _kernel_init_obj( info, obj ); + //vxReleaseKernel( &obj ); + } + else + { + VSILOGE( "Add kernel %s fail.", info->name ); + } + if( program ) + { + vxReleaseProgram( &program ); + } + return status; +} /* _gpu_register_ext() */ + static vsi_status _kernel_init_obj ( vx_kernel_description_t* info, @@ -620,6 +801,19 @@ vsi_status vsi_nn_kernel_register return status; } /* vsi_nn_kernel_register() */ +vsi_status vsi_nn_kernel_register_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ) +{ + vsi_status status; + status = VSI_FAILURE; + status = _gpu_register_ext( graph, kernel,resources ); + return status; +} /* vsi_nn_kernel_register_ext */ + vsi_nn_kernel_node_t vsi_nn_kernel_create_node ( vsi_nn_graph_t* graph, @@ -667,7 +861,6 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node status = vxGetStatus( (vx_reference)obj ); if (VSI_SUCCESS != status) { - fprintf(stderr, "\n"); // TODO: This is a hack for driver msg /* Register kernel */ status = vsi_nn_kernel_register( graph, kernel ); if( VSI_SUCCESS != status ) @@ -712,6 +905,92 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node return (vsi_nn_kernel_node_t)node; } /* vsi_nn_kernel_create_node() */ +vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ){ + vsi_status status; + vx_context ctx; + vx_kernel obj; + vx_node node; + vx_kernel_description_t* info; + + info = &(kernel->info); + // Validate kernel + if( !info->initialize ) + { + VSILOGE("Kernel %s initializer is NULL", info->name); + return NULL; + } + if( !info->validate ) + { + VSILOGE("Kernel %s validator is NULL", info->name); + return NULL; + } + if( !info->deinitialize ) + { + VSILOGE("Kernel %s deinitializer is NULL", info->name); + return NULL; + } + if( info->enumeration == KERNEL_ID_PLACEHOLDER ) + { + //VSILOGD("Kernel id: %#x, %#x", kernel->unique_id, info->enumeration); + info->enumeration = (vx_enum)kernel->unique_id; + } + + ctx = vxGetContext( (vx_reference)graph->g ); + + obj = vxGetKernelByName( ctx, info->name ); + status = vxGetStatus( (vx_reference)obj ); + if (VSI_SUCCESS != status) + { + fprintf(stderr, "\n"); // TODO: This is a hack for driver msg + /* Register kernel */ + status = vsi_nn_kernel_register_ext( graph, kernel,resources ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Register client kernel %s fail with %d.", + info->name, status ); + return NULL; + } + else + { + VSILOGD( "Register client kernel %s successfully.", + info->name ); + } + + /* Load kernel */ + obj = vxGetKernelByName( ctx, info->name ); + status = vxGetStatus( (vx_reference)obj ); + } + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client kernel %s fail with %d.", + info->name, status ); + return NULL; + } + node = vxCreateGenericNode( graph->g, obj ); + 
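+    /* The node created above holds its own reference to the kernel object, so the local handle obtained from vxGetKernelByName() can be released right away. */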
vxReleaseKernel( &obj ); + status = vxGetStatus( (vx_reference)node ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client node from kernel %s fail with %d.", + info->name, status ); + return NULL; + } + if( node ) + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute( node, VX_NODE_BORDER, &border, sizeof(border) ); + } + return (vsi_nn_kernel_node_t)node; +} /* vsi_nn_kernel_create_node_ext() */ + vsi_status vsi_nn_kernel_node_set_border (vsi_nn_kernel_node_t node, vx_border_t* border) @@ -987,7 +1266,8 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector /* Skip evis and cl when disable shader */ if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL) - && _check_shader_support(graph) == FALSE) + && ( _check_shader_support(graph) == FALSE || + vsi_nn_kernel_is_asymmtric_int8(inputs, input_num, outputs, output_num) ) ) { continue; } @@ -1292,3 +1572,38 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) return FALSE; } + +static vsi_bool vsi_nn_kernel_is_asymmtric_int8 + ( + vsi_nn_tensor_t** inputs, + size_t input_num, + vsi_nn_tensor_t** outputs, + size_t output_num + ) +{ + size_t i = 0; + + for (i = 0; i < input_num; i++) + { + if ( inputs[i] && + inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + inputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ) + { + return TRUE; + } + } + + for (i = 0; i < output_num; i++) + { + if ( outputs[i] && + outputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + outputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ) + { + return TRUE; + } + } + + return FALSE; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index da0de6e..105027d 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -361,7 +361,6 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape return ret; } /* vsi_nn_kernel_optimize_softmax_shape() */ - typedef enum { TILE_STATE_AXIS_X = 0, @@ -611,4 +610,47 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape *out_rank = vsi_nn_min(dim_num, 3); return TRUE; +} + +vsi_bool vsi_nn_kernel_optimize_group_norm_shape + ( + const vsi_size_t* shape, const uint32_t rank, int32_t groups, + int32_t is_sp_kernel, vsi_size_t* out_shape + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t i = 0; + vsi_size_t out_rank = 0; + vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + group_shape[0] = shape[0]; + group_shape[1] = shape[1]; + group_shape[2] = shape[2] / groups; + + vsi_nn_kernel_optimize_element_shape( group_shape, 3, out_shape, &out_rank ); + + if (!is_sp_kernel && out_shape[1] == 1 && out_rank < 3) + { + out_shape[1] = groups; + out_shape[2] = 1; + out_shape[3] = 1; + for (i = 3; i < rank; i++) + { + out_shape[3] = out_shape[3] * shape[i]; + } + } + else if (out_rank == 2) + { + out_shape[2] = groups; + out_shape[3] = 1; + for (i = 3; i < rank; i++) + { + out_shape[3] = out_shape[3] * shape[i]; + } + } + else + { + status = VSI_FAILURE; + } + + return status; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c new file mode 100644 index 0000000..955c61d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c @@ -0,0 +1,84 @@ 
+/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_lut.h" + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + node = vxBatchNormalizationLayer( + graph->g, + eps, + inputs[1]->t, + inputs[2]->t, + inputs[3]->t, + inputs[4]->t, + inputs[0]->t, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* _setup() */ + +#define REGISTER_BATCH_NORM_OPENVX_KERNEL(KERNEL_NAME) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_BATCH_NORM_OPENVX_KERNEL( batch_norm ) + +#undef REGISTER_BATCH_NORM_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 8cc0794..89c8fa4 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -181,6 +181,51 @@ static vsi_bool _build_vx_conv3d_param } /* _build_vx_conv2d_param() */ #endif +#if VX_DECONV_3D_API_SUPPORT +static vsi_bool _build_vx_deconv3d_param + ( + vx_nn_deconvolution_3d_params_t * param, + int32_t stride_d, int32_t stride_h, int32_t stride_w, + int32_t pad_d_front, int32_t pad_d_end, + int32_t pad_h_front, int32_t pad_h_end, + int32_t pad_w_front, int32_t pad_w_end, + int32_t outpadding_d, int32_t outpadding_h, int32_t outpadding_w, + int32_t group, vsi_enum overflow_policy, + vsi_enum rounding_policy, vsi_enum 
down_scale_size_rounding + ) +{ + VSI_ASSERT( stride_d > 0 ); + VSI_ASSERT( stride_h > 0 ); + VSI_ASSERT( stride_w > 0 ); + VSI_ASSERT( outpadding_d >= 0 ); + VSI_ASSERT( outpadding_h >= 0 ); + VSI_ASSERT( outpadding_w >= 0 ); + VSI_ASSERT( group >= 0 ); + + param->padding_d_front = (uint32_t)pad_d_front; + param->padding_d_rear = (uint32_t)pad_d_end; + param->padding_h_top = (uint32_t)pad_h_front; + param->padding_h_bottom = (uint32_t)pad_h_end; + param->padding_w_left = (uint32_t)pad_w_front; + param->padding_w_right = (uint32_t)pad_w_end; + + param->a_w = outpadding_w; + param->a_h = outpadding_h; + param->a_d = outpadding_d; + + param->overflow_policy = (vx_enum)overflow_policy; + param->rounding_policy = (vx_enum)rounding_policy; + param->down_scale_size_rounding = (vx_enum)down_scale_size_rounding; + param->channel_group = group; + + param->stride_w = (uint32_t)stride_w; + param->stride_h = (uint32_t)stride_h; + param->stride_d = (uint32_t)stride_d; + + return TRUE; +} /* _build_vx_deconv3d_param() */ +#endif + static vx_tensor _expand_tensor_dim ( vx_tensor tensor, vsi_ssize_t * shape, size_t rank, vsi_ssize_t expand_dim ) { @@ -242,7 +287,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; vx_tensor temp_tensors[3] = { NULL }; - int i; + uint32_t i = 0; _build_vx_conv2d_param( @@ -270,7 +315,6 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) { uint8_t * data = NULL; vsi_nn_tensor_attr_t attr; - uint32_t i; data = vsi_nn_ConvertTensorToData( graph, inputs[1] ); CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); @@ -317,7 +361,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; vx_tensor temp_tensors[3] = { NULL }; - int32_t i; + uint32_t i = 0; vsi_bool need_explicit_padding = FALSE; _build_vx_conv2d_param( @@ -344,7 +388,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) new_w_shape[0] = inputs[1]->attr.size[0]; new_w_shape[1] = 1; new_w_shape[2] = 1; - for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++) + for (i = 1; i < inputs[1]->attr.dim_num; i++) { new_w_shape[2] *= inputs[1]->attr.size[i]; } @@ -358,7 +402,6 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) { uint8_t * data = NULL; vsi_nn_tensor_attr_t attr; - uint32_t i; data = vsi_nn_ConvertTensorToData( graph, inputs[1] ); CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); @@ -576,4 +619,41 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) return (vsi_nn_kernel_node_t)node; } /* depthwise_conv2d*/ -#undef REGISTER_CONV_OPENVX_KERNEL +REGISTER_CONV_OPENVX_KERNEL( deconv3d ) +{ + vx_node node = NULL; +#if VX_DECONV_3D_API_SUPPORT + vx_nn_deconvolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); + + _build_vx_deconv3d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride_d"), + vsi_nn_kernel_param_get_int32(params, "stride_h"), + vsi_nn_kernel_param_get_int32(params, "stride_w"), + vsi_nn_kernel_param_get_int32(params, "pad_front"), + vsi_nn_kernel_param_get_int32(params, "pad_end"), + vsi_nn_kernel_param_get_int32(params, "pad_top"), + vsi_nn_kernel_param_get_int32(params, "pad_bottom"), + vsi_nn_kernel_param_get_int32(params, "pad_left"), + vsi_nn_kernel_param_get_int32(params, "pad_right"), + vsi_nn_kernel_param_get_int32(params, "outpadding_d"), + vsi_nn_kernel_param_get_int32(params, "outpadding_h"), + vsi_nn_kernel_param_get_int32(params, "outpadding_w"), + vsi_nn_kernel_param_get_int32(params, "group"), + vsi_nn_kernel_param_get_int32(params, 
"overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + node = vxDeconv3dLayer( graph->g, + inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, + &vxparam, + sizeof( vxparam), + outputs[0]->t + ); +#endif + return (vsi_nn_kernel_node_t)node; +} /* deconv3d */ + +#undef REGISTER_CONV_OPENVX_KERNEL \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c new file mode 100644 index 0000000..d67751b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -0,0 +1,113 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +#define REGISTER_PAD2_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_PAD2_OPENVX_KERNEL( pad2 ) +{ + vx_node node = NULL; + vx_nn_pad_params_t param; + size_t dim_num = 0; + int32_t* front_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "front_size", &dim_num); + int32_t* back_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "back_size", &dim_num); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "pad_mode"); + int32_t pad_front_array[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t pad_back_array[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_nn_tensor_t *convert_tensor = NULL; + float const_val = vsi_nn_kernel_param_get_float32(params, "const_val"); + + memset(¶m, 0, sizeof(param)); + memset(pad_front_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(pad_back_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + memcpy(pad_front_array, front_size, sizeof(int32_t) * dim_num); + memcpy(pad_back_array, back_size, sizeof(int32_t) * dim_num); + + param.pad_mode = pad_mode; + param.pad_const = vxCreateScalar( graph->ctx->c, VX_TYPE_FLOAT32, &const_val ); + param.numViewDimensions = (uint8_t)vsi_nn_max(dim_num, 2); + param.pad_front_array = pad_front_array; + param.pad_back_array = pad_back_array; + + if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + vsi_nn_tensor_attr_t attr; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); + memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) ); + attr.vtl = FALSE; + attr.is_const = FALSE; + + convert_tensor = vsi_nn_CreateTensor(graph, &attr); + + node = vxTensorCopyNode( + graph->g, + inputs[0]->t, + convert_tensor->t + ); + } + else + { + convert_tensor = vsi_nn_reshape_tensor( graph, + inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + } + + node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, ¶m, sizeof(param) ); + + vxReleaseScalar( ¶m.pad_const ); + vsi_safe_release_tensor(convert_tensor); + + return (vsi_nn_kernel_node_t)node; +} /* pad2() */ + +#undef REGISTER_PAD2_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl b/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl new file mode 100644 index 0000000..dddb09b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl @@ -0,0 +1,37 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +__kernel void clip_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0 = read_imageui(input, coord); + src0 
= src0 << 16; + float4 src; + _viv_asm(COPY, src, src0, 16); + float4 dst0 = clamp(src, minData, maxData); + uint4 dst; + _viv_asm(COPY, dst, dst0, 16); + dst = dst >> 16; + write_imageui(output, coord, dst); +} + +__kernel void clip_BF16toBF16_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + src0 = src0 << 16; + float4 src; + _viv_asm(COPY, src, src0, 16); + float4 dst0 = clamp(src, minData, maxData); + uint4 dst; + _viv_asm(COPY, dst, dst0, 16); + dst = dst >> 16; + write_imageui(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl b/src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl new file mode 100644 index 0000000..12f6977 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl @@ -0,0 +1,17 @@ + +__kernel void depth2space_crd_F32toF32( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int block_e2 = block_size * block_size; + ushort blk = (ushort)block_size; + int inx = (int)((ushort)gidx / blk); + int iny = (int)((ushort)gidy / blk); + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; + int4 coord_in = (int4)(inx, iny, inz, 0); + float4 data = read_imagef(input, coord_in); + write_imagef(output, coord_out, data); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index 5b90eb1..55b63cb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -3,6 +3,11 @@ float eltwise_unary_sin(float x, float alpha, float beta) return native_sin(x); } +float eltwise_unary_cos(float x, float alpha, float beta) +{ + return native_cos(x); +} + #define logE (1.44269502f) #define twoLogE (logE * 2.0f) float eltwise_unary_exp(float x, float alpha, float beta) @@ -135,6 +140,7 @@ __kernel void func_name##_F32toF32 \ write_imagef(output, coord, dst.xxxx); \ } ELTWISE_UNARY_F32(sin) +ELTWISE_UNARY_F32(cos) ELTWISE_UNARY_F32(exp) ELTWISE_UNARY_F32(log) ELTWISE_UNARY_F32(elu) @@ -168,6 +174,7 @@ __kernel void func_name##_F32toF32_2D \ write_imagef(output, coord, dst.xxxx); \ } ELTWISE_UNARY_F32_2D(sin) +ELTWISE_UNARY_F32_2D(cos) ELTWISE_UNARY_F32_2D(exp) ELTWISE_UNARY_F32_2D(log) ELTWISE_UNARY_F32_2D(elu) @@ -202,6 +209,7 @@ __kernel void func_name##_U8toU8 \ write_imageui(output, coord, dst); \ } ELTWISE_UNARY_U8(sin) +ELTWISE_UNARY_U8(cos) ELTWISE_UNARY_U8(exp) ELTWISE_UNARY_U8(log) ELTWISE_UNARY_U8(elu) @@ -236,6 +244,7 @@ __kernel void func_name##_U8toU8_2D \ write_imageui(output, coord, dst); \ } ELTWISE_UNARY_U8_2D(sin) +ELTWISE_UNARY_U8_2D(cos) ELTWISE_UNARY_U8_2D(exp) ELTWISE_UNARY_U8_2D(log) ELTWISE_UNARY_U8_2D(elu) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl index 64f6775..1bacbc0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -1,7 +1,15 @@ -__kernel void floordiv_F32F32toF32( +__kernel void floordiv_F32F32toF32 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, - __write_only image2d_array_t output) + __write_only image2d_array_t output, + float 
input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); float4 src0; @@ -12,10 +20,18 @@ __kernel void floordiv_F32F32toF32( write_imagef(output, coord, dst); } -__kernel void floordiv_F32F32toF32_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output) +__kernel void floordiv_F32F32toF32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); float4 src0 = read_imagef(input, coord); @@ -24,33 +40,8 @@ __kernel void floordiv_F32F32toF32_2D( write_imagef(output, coord, dst); } -__kernel void floordiv_I32I32toI32( - __read_only image2d_array_t input, - __read_only image2d_array_t input1, - __write_only image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 src0; - int4 src1; - READ_IMAGEI_2DARRAY(src0, input, coord); - READ_IMAGEI_2DARRAY(src1, input1, coord); - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1))); - write_imagei(output, coord, dst); -} - -__kernel void floordiv_I32I32toI32_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - int4 src0 = read_imagei(input, coord); - int4 src1 = read_imagei(input1, coord); - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1))); - write_imagei(output, coord, dst); -} - -__kernel void floordiv_I32I32toU8( +__kernel void floordiv_I32I32toI32 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -59,7 +50,56 @@ __kernel void floordiv_I32I32toU8( float input1Scale, float input1Tail, float outputScale, - float outputTail ) + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void floordiv_I32I32toI32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void floordiv_I32I32toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0); int4 src0; @@ -73,16 +113,18 @@ __kernel void floordiv_I32I32toU8( write_imageui(output, coord, dst); } -__kernel void floordiv_I32I32toU8_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output, - float input0Scale, - float input0Tail, - float input1Scale, - float input1Tail, - float outputScale, - float outputTail ) +__kernel void floordiv_I32I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 src0 = read_imagei(input, coord); @@ -94,7 +136,8 @@ __kernel void floordiv_I32I32toU8_2D( write_imageui(output, coord, dst); } -__kernel void floordiv_U8U8toU8( +__kernel void floordiv_U8U8toU8 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -103,7 +146,8 @@ __kernel void floordiv_U8U8toU8( float input1Scale, float input1Tail, float outputScale, - float outputTail ) + float outputTail + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); uint4 src0, src1; @@ -117,16 +161,18 @@ __kernel void floordiv_U8U8toU8( write_imageui(output, coord, dst); } -__kernel void floordiv_U8U8toU8_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output, - float input0Scale, - float input0Tail, - float input1Scale, - float input1Tail, - float outputScale, - float outputTail ) +__kernel void floordiv_U8U8toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); uint4 src0 = read_imageui(input, coord); @@ -139,7 +185,8 @@ __kernel void floordiv_U8U8toU8_2D( write_imageui(output, coord, dst); } -__kernel void floordiv_U8I32toU8( +__kernel void floordiv_U8I32toU8 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -148,7 +195,8 @@ __kernel void floordiv_U8I32toU8( float input1Scale, float input1Tail, float outputScale, - float outputTail ) + float outputTail + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); uint4 src0; @@ -163,16 +211,18 @@ __kernel void floordiv_U8I32toU8( write_imageui(output, coord, dst); } -__kernel void floordiv_U8I32toU8_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output, - float input0Scale, - float input0Tail, - float input1Scale, - float input1Tail, - float outputScale, - float outputTail ) +__kernel void floordiv_U8I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); uint4 src0 = read_imageui(input, coord); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl index 1c8caff..49d04e2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl @@ -5,7 +5,8 @@ __kernel void gather_U8toU8( int block_size, 
int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size @@ -29,7 +30,8 @@ __kernel void gather_F16toF16( int block_size, int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size @@ -53,7 +55,8 @@ __kernel void gather_I32toI32( int block_size, int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size @@ -77,7 +80,8 @@ __kernel void gather_F32toF32( int block_size, int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl new file mode 100644 index 0000000..4ff6ec1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl @@ -0,0 +1,123 @@ +__kernel void gather_batch_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + uint4 data = read_imageui(input0, coord_in); + coord_in.z++; + write_imageui(output, coord, data); + coord.z++; + } +} + +__kernel void gather_batch_F16toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + float4 data = read_imagef(input0, coord_in); + coord_in.z++; + write_imagef(output, coord, data); + coord.z++; + } +} + +__kernel void gather_batch_I32toI32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + int4 data = read_imagei(input0, coord_in); + coord_in.z++; + write_imagei(output, coord, data); + coord.z++; + } +} + +__kernel void gather_batch_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = 
get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + float4 data = read_imagef(input0, coord_in); + coord_in.z++; + write_imagef(output, coord, data); + coord.z++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl index 8b4dd55..effa919 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl @@ -112,6 +112,48 @@ __kernel void moments_axis0_I32toF32( vari.x = sqr * dimRatio * input_scale * input_scale; vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidy, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} + +__kernel void moments_axis0_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(0, gidy, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + for(coord0.x = 0; coord0.x < width;) + { + uint4 src0 = read_imageui(input, coord0); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord0.x++; + + sum = sum + data.x; + sqr = sqr + data.x * data.x; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidy, gidz); write_imagef(output_mean, coord_out, mean); write_imagef(output_vari, coord_out, vari); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl index a89ec8a..05f9e3a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl @@ -172,3 +172,63 @@ __kernel void moments_axis01_I32toF32( write_imagef(output_vari, coord_out, vari); } } + +__kernel void moments_axis01_BF16toF32( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + float tmpSum = 0, tmpSqr = 0; + for(coord.y = 0; coord.y < height;) + { + uint4 src0 = read_imageui(input, coord); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord.y++; + + tmpSum = tmpSum + data.x; + tmpSqr = tmpSqr + data.x * data.x; + } + sqr += tmpSqr; + sum += tmpSum; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += 
dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl index fa0ce44..44e9809 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl @@ -177,3 +177,64 @@ __kernel void moments_axis012_I32toF32( write_imagef(output_vari, coord_out, vari); } } + +__kernel void moments_axis012_BF16toF32( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, 0, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.z = 0; coord.z < chn; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + float tmpSum = 0, tmpSqr = 0; + for(coord.y = 0; coord.y < height;) + { + uint4 src0 = read_imageui(input, coord); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord.y++; + tmpSum = tmpSum + data.x; + tmpSqr = tmpSqr + data.x * data.x; + } + sqr += tmpSqr; + sum += tmpSum; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl index a18bdc2..191e321 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl @@ -106,6 +106,47 @@ __kernel void moments_axis1_I32toF32( vari.x = sqr * dimRatio * input_scale * input_scale; vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidx, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} + +__kernel void moments_axis1_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + for(coord0.y = 0; coord0.y < height;) + { + uint4 src0 = read_imageui(input, coord0); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord0.y++; + sum = sum + data.x; + sqr = sqr + data.x * data.x; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidx, gidz); write_imagef(output_mean, coord_out, mean); write_imagef(output_vari, 
coord_out, vari); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl index 078cf74..8cf72cb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl @@ -123,4 +123,46 @@ __kernel void moments_axis2_I32toF32( int2 coord_out = (int2)(gidx, gidy); write_imagef(output_mean, coord_out, mean); write_imagef(output_vari, coord_out, vari); -} \ No newline at end of file +} + +__kernel void moments_axis2_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int4 coord0 = (int4)(gidx, gidy, 0, 0); + float4 data; + float sum = 0, sqr = 0; + + for(coord0.z = 0; coord0.z < chn;) + { + uint4 src0 = read_imageui(input, coord0); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord0.z++; + + sum = sum + data.x; + sqr = sqr + data.x * data.x; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidx, gidy); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl new file mode 100644 index 0000000..2596e66 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -0,0 +1,251 @@ +#define TOPK_F32(LOCAL_SIZE0, STAGES) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toF32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + int num_stages, \ + int width \ + ) \ + { \ + uint local_id = get_local_id(0); \ + uint work_group_size = get_local_size(0); \ + uint offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local float local_data[128]; \ + __local uint local_indices[128]; \ + \ + float left = read_imagef(input, coord.xy).x; \ + coord.z += work_group_size; \ + float data = read_imagef(input, coord.zy).x; \ + float right = coord.z < width ? 
data : -2147483647.0f; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (local_id >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + uint left_idx = local_indices[left_id]; \ + uint right_idx = local_indices[right_id]; \ + \ + float left_elem = local_data[left_id]; \ + float right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + float4 dst; \ + dst.x = local_data[local_id]; \ + dst.y = local_data[local_id + work_group_size]; \ + \ + write_imagef(output, coord.xy, dst.xxxx); \ + write_imagef(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } +TOPK_F32(1 << 0, 0) +TOPK_F32(1 << 1, 1) +TOPK_F32(1 << 2, 2) +TOPK_F32(1 << 3, 3) +TOPK_F32(1 << 4, 4) +TOPK_F32(1 << 5, 5) +TOPK_F32(1 << 6, 6) + +#define TOPK_U32(LOCAL_SIZE0, STAGES) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + int num_stages, \ + int width \ + ) \ + { \ + uint local_id = get_local_id(0); \ + uint work_group_size = get_local_size(0); \ + uint offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local uint local_data[128]; \ + __local uint local_indices[128]; \ + \ + uint left = read_imageui(input, coord.xy).x; \ + coord.z += work_group_size; \ + uint data = read_imageui(input, coord.zy).x; \ + uint right = coord.z < width ? 
data : 0; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (local_id >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + uint left_idx = local_indices[left_id]; \ + uint right_idx = local_indices[right_id]; \ + \ + uint left_elem = local_data[left_id]; \ + uint right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + uint4 dst; \ + dst.x = local_data[local_id]; \ + dst.y = local_data[local_id + work_group_size]; \ + \ + write_imageui(output, coord.xy, dst.xxxx); \ + write_imageui(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } +TOPK_U32(1 << 0, 0) +TOPK_U32(1 << 1, 1) +TOPK_U32(1 << 2, 2) +TOPK_U32(1 << 3, 3) +TOPK_U32(1 << 4, 4) +TOPK_U32(1 << 5, 5) +TOPK_U32(1 << 6, 6) + +#define TOPK_I32(LOCAL_SIZE0, STAGES) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + int num_stages, \ + int width \ + ) \ + { \ + int local_id = get_local_id(0); \ + int work_group_size = get_local_size(0); \ + int offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local int local_data[128]; \ + __local int local_indices[128]; \ + \ + int left = read_imagei(input, coord.xy).x; \ + coord.z += work_group_size; \ + int data = read_imagei(input, coord.zy).x; \ + int right = coord.z < width ? 
data : -2147483647; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (int stage = 0; stage < num_stages + 1; ++stage) \ + { \ + int signo = (local_id >> stage) & 1; \ + \ + for (int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + int postShift = (stage - passOfStage); \ + int pairDistance = 1 << postShift; \ + \ + int left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + int right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + int left_elem = local_data[left_id]; \ + int right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + int4 dst; \ + dst.x = local_data[local_id]; \ + dst.y = local_data[local_id + work_group_size]; \ + \ + write_imagei(output, coord.xy, dst.xxxx); \ + write_imagei(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } +TOPK_I32(1 << 0, 0) +TOPK_I32(1 << 1, 1) +TOPK_I32(1 << 2, 2) +TOPK_I32(1 << 3, 3) +TOPK_I32(1 << 4, 4) +TOPK_I32(1 << 5, 5) +TOPK_I32(1 << 6, 6) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx index ac867c9..540a834 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx @@ -3,6 +3,8 @@ _viv_uniform int4 packedArgIdx; _viv_uniform int argLenSub1; _viv_uniform VXC_512Bits uniExtractData_2x8; +_viv_uniform VXC_512Bits uniExtract1stU8toI16_2x8; +_viv_uniform VXC_512Bits uniExtract2ndU8toI16_2x8; #define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\ src_type, copy_type, axis_type, dst_type, inst_type) \ @@ -67,6 +69,56 @@ TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_ #define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \ __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \ __read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + src_type src; \ + src_type maxVal; \ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.z --; \ + packIdx --; \ + maxVal = max(maxVal, src); \ + src_type condition; \ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + \ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char16, vxc_uchar16) +TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar16, vxc_uchar16) + +#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8) + +#define TENSOR_ARGMAX_AXIS2_MIX(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axisVal \ ) \ @@ -95,23 +147,46 @@ __write_only image2d_array_t output, \ \ VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8) -TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_MIX(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMAX_AXIS2_MIX(U8, I16, vxc_uchar8, vxc_short8) -#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ - __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \ +#define TENSOR_ARGMAX_AXIS2_MIX_OPT(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_opt( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axisVal \ ) \ { \ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ - VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + src_type src; \ + src_type maxVal; \ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + vxc_uchar16 axis; \ + vxc_uchar16 packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.z --; \ + packIdx --; \ + maxVal = max(maxVal, src); \ + src_type condition; \ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + vxc_short8 dst0, dst1; \ + VXC_DP2x8(dst0, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtract1stU8toI16_2x8); \ + VXC_DP2x8(dst1, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtract2ndU8toI16_2x8); \ + \ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + VXC_WriteImage(output, coord.xy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8) -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_MIX_OPT(I8, I16, vxc_char16, vxc_short8) +TENSOR_ARGMAX_AXIS2_MIX_OPT(U8, I16, vxc_uchar16, vxc_short8) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx similarity index 83% rename from src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx index 305f666..e3ca29e 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx @@ -19,14 +19,13 @@ __kernel void Softmax2VXC int axis ) { - int4 coord_in = (int4)(0,0,0,0); float fMax = 0.0; for (int i = 0; i < sf_size; i++) { vxc_char8 val; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); float fval; VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); @@ -40,7 +39,7 @@ __kernel void Softmax2VXC vxc_char8 val; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); float fval; VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); @@ -57,7 +56,7 @@ __kernel void Softmax2VXC vxc_short8 val; vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, output, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); float fval; _viv_asm(COPY, val_h,val, 16); VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); @@ -68,8 +67,4 @@ __kernel void Softmax2VXC _viv_asm(COPY,dst,hVal, 4); VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } - } - - - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx new file mode 100644 index 0000000..cd9511b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx @@ -0,0 +1,353 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 matrix0; +_viv_uniform float2 matrix1; +_viv_uniform float4 matrix4; +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0), 
get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + 
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_nearest_neighbor_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * 
matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, dst; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + 
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx new file mode 100644 index 0000000..3b3b3f1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx @@ -0,0 +1,395 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 matrix0; +_viv_uniform float4 matrix1; +_viv_uniform float4 matrix2; +_viv_uniform float4 matrix4; +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_perspective_bilinear_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) 
+ 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(floor(coord_f)); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(floor(coord_f)); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(floor(coord_f)); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + 
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(floor(coord_f)); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \ + VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0)); +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + IMAGE_LOAD_3D(dst, 0, 0, 0, 0) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 1, 1) + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(dst, 0, 0, 2, 2) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 3, 3) + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(coord_f); + coord_input.xy = 
coord_in.xy; + IMAGE_LOAD_3D(dst, 0, 0, 4, 4) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 5, 5) + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(dst, 0, 0, 6, 6) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 7, 7) + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_perspective_bilinear_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(floor(coord_f)); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, dst; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(floor(coord_f)); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(floor(coord_f)); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) 
+ IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(floor(coord_f)); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx index a5612b4..601ebfd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx @@ -304,4 +304,4 @@ __kernel void depth2space_crd_F16toI16_blk2( VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_out.x += 8; VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index 8a56bb3..086e399 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x) return native_sin(x); } +float4 eltwise_unary_cos(float4 x) +{ + return native_cos(x); +} + #define logE (1.44269502f) #define twoLogE (logE * 2.0f) float4 eltwise_unary_exp(float4 x) @@ -189,6 +194,17 @@ ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//COS +ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, 
vxc_uchar8) +ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) //LOG ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) @@ -315,6 +331,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; ELTSISE_UNARY_BF16_2D(exp) //SIN ELTSISE_UNARY_BF16_2D(sin) +//COS +ELTSISE_UNARY_BF16_2D(cos) //LOG ELTSISE_UNARY_BF16_2D(log) //ELU diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index 3faa1f5..a7ba363 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x) return native_sin(x); } +float4 eltwise_unary_cos(float4 x) +{ + return native_cos(x); +} + #define logE (1.44269502f) #define twoLogE (logE * 2.0f) float4 eltwise_unary_exp(float4 x) @@ -189,6 +194,17 @@ ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//COS +ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) //LOG ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) @@ -314,6 +330,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; ELTSISE_UNARY_BF16(exp) //SIN ELTSISE_UNARY_BF16(sin) +//COS +ELTSISE_UNARY_BF16(cos) //LOG ELTSISE_UNARY_BF16(log) //ELU diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index 90b5135..3a1661e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -91,8 +91,6 @@ __kernel void gather_F16toF16( int gidz = get_global_id(2); // block_num int4 coord_in = (int4)(gidy, 0, gidx, 0); - - int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * 
axis_num + indice.x; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx new file mode 100644 index 0000000..8d09d50 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx @@ -0,0 +1,237 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; +_viv_uniform int batch; + +__kernel void gather_batch_I8toI8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_char16 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_uchar16 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_I16toI16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_F16toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice 
= read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_I8toI8_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_char16 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_batch_U8toU8_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_batch_I16toI16_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + 
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_batch_F16toF16_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx new file mode 100644 index 0000000..0e94445 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx @@ -0,0 +1,236 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; +_viv_uniform int batch; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +#define GATHER_BATCH_8BITS_TO_F16(src0_type_name, read_type) \ +__kernel void gather_batch_##src0_type_name##toF16( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int2 coord_idx = (int2)(gidy, 0); \ + int4 coord_in = (int4)(gidx, 0, 0, 0); \ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + \ + for(; coord_idx.y < batch;) \ + { \ + int4 indice = read_imagei(input1, coord_idx); \ + coord_idx.y++; \ + coord_in.y = gidz * axis_num + indice.x; \ + \ + read_type src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z++; \ + vxc_half8 src0, src1; \ + vxc_short8 dst0, dst1; \ + \ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + _viv_asm(COPY, dst1, src1, 16); \ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + VXC_WriteImage2DArray(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + coord.x = gidx; \ + } \ +} +GATHER_BATCH_8BITS_TO_F16(U8, vxc_uchar16) +GATHER_BATCH_8BITS_TO_F16(I8, vxc_char16) + +#define GATHER_BATCH_F16_TO_QINT(src1_type_name, write_type) \ +__kernel void gather_batch_F16to##src1_type_name( \ + __read_only image2d_t input0, \ + __read_only image2d_t 
input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int2 coord_idx = (int2)(gidy, 0); \ + int4 coord_in = (int4)(gidx, 0, 0, 0); \ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + for(; coord_idx.y < batch;) \ + { \ + int4 indice = read_imagei(input1, coord_idx); \ + coord_idx.y++; \ + coord_in.y = gidz * axis_num + indice.x; \ + \ + vxc_short8 src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z++; \ + \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + } \ +} +GATHER_BATCH_F16_TO_QINT(U8, vxc_uchar16) +GATHER_BATCH_F16_TO_QINT(I8, vxc_char16) +GATHER_BATCH_F16_TO_QINT(I16, vxc_short8) + +__kernel void gather_batch_I16toF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + vxc_half8 src0; + vxc_short8 dst0; + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, src0, 16); + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +#define GATHER_BATCH_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \ +__kernel void gather_batch_##src0_type_name##toF16_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 indices = read_imagei(input1, coord.xz); \ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ + \ + read_type src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.y; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.z; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.w; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} 
+GATHER_BATCH_8BITS_TO_F16_AXIS0(U8, vxc_uchar16) +GATHER_BATCH_8BITS_TO_F16_AXIS0(I8, vxc_char16) + +#define GATHER_BATCH_F16_TO_QINT_AXIS0(src1_type_name, write_type) \ +__kernel void gather_batch_F16to##src1_type_name##_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 indices = read_imagei(input1, coord.xz); \ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.y; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.z; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.w; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_BATCH_F16_TO_QINT_AXIS0(U8, vxc_uchar16) +GATHER_BATCH_F16_TO_QINT_AXIS0(I8, vxc_char16) +GATHER_BATCH_F16_TO_QINT_AXIS0(I16, vxc_short8) + +__kernel void gather_batch_I16toF16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + vxc_half8 src0; + vxc_short8 dst0; + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx b/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx index 4ba7c40..dceb404 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx @@ -1,5 +1,7 @@ #include "cl_viv_vx_ext.h" +_viv_uniform VXC_512Bits uniConvertInt16toInt8_2x8; + #define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\ lgc_op, lgc_op2, read_fun, write_fun) \ input_type vA;\ @@ -59,7 +61,7 @@ out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \ VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \ out_copy_type data; \ _viv_asm(COPY, data, tmpOut, 16); 
\ - write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); #define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\ @@ -86,6 +88,47 @@ copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ VXC_ReadImage, VXC_WriteImage) \ } +#define TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type,\ +out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \ + input_type vA;\ + copy_type src0;\ + input_type vB;\ + copy_type src1;\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src0, vA, 16); \ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src1, vB, 16); \ + output_type dst; \ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \ + vxc_char8 data; \ + VXC_DP2x8(data,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 1),uniConvertInt16toInt8_2x8); \ + data &= 1; \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define TENSORLOGICAL_BFP16(name0, src_type_name, dst_type_name, input_type,\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \ + __read_only image2d_array_t in0, \ + __read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\ + TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} + +#define TENSORLOGICAL_BFP16_2D(name0, src_type_name, dst_type_name, input_type,\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t in0, \ + __read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\ + TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\ + VXC_ReadImage, VXC_WriteImage) \ +} + // name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2 TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, ) //TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, ) @@ -100,6 +143,10 @@ TENSORLOGICAL(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vx //TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) //TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) +TENSORLOGICAL_BFP16(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +TENSORLOGICAL_BFP16(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +TENSORLOGICAL_BFP16(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) + TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, ) //TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, ) //TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) @@ -112,3 +159,7 @@ TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, //TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!) //TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) 
//TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) + +TENSORLOGICAL_BFP16_2D(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +TENSORLOGICAL_BFP16_2D(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +TENSORLOGICAL_BFP16_2D(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx new file mode 100644 index 0000000..433dc4f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx @@ -0,0 +1,272 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void gemm_BF16BF16toBF16(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, + int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); + + vxc_ushort8 valC0, valC1, src0, src1; + vxc_ushort8 srcA0, srcB0, srcA1, srcB1, outC; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; + coord_b.y += 4; + + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA0, src0, 16); + VXC_DP2x8(src1, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempA1, src1, 16); + VXC_DP2x8(src0, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA2, src0, 16); + 
VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempA3, src1, 16); + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB0, src0, 16); + VXC_DP2x8(src1, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempB1, src1, 16); + VXC_DP2x8(src0, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB2, src0, 16); + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempB3, src1, 16); + + sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); + sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); + sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); + sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); + } + coord_b.y = gidy; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(COPY, valC0, sum0, 16); + _viv_asm(COPY, valC1, sum1, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(COPY, valC0, sum2, 16); + _viv_asm(COPY, valC1, sum3, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void gemm_transa_BF16BF16toBF16( + image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + + vxc_ushort8 valC0, valC1; + vxc_ushort8 srcA, srcB, outC, src0, src1; + + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + vxc_float4 tempA0; + vxc_float4 tempB0; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) + { + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_a.y++; + coord_b.y++; + + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA0, src0, 16); + + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB0, src1, 16); + + sum0 = (sum0 + tempA0.x * tempB0); + sum1 = (sum1 + tempA0.y * tempB0); + sum2 = (sum2 + tempA0.z * tempB0); + sum3 = (sum3 + tempA0.w * tempB0); + } + coord_b.y = gidy; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(COPY, valC0, sum0, 16); + _viv_asm(COPY, valC1, sum1, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + _viv_asm(COPY, valC0, sum2, 16); + _viv_asm(COPY, valC1, sum3, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_ushort8 src0, src1; + + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_ushort8 srcA0,srcA1,srcA2,srcA3; + vxc_ushort8 srcB0,srcB1,srcB2,srcB3; + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; + coord_b.x += 4; + + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA0, src0, 16); + VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA1, src1, 16); + VXC_DP2x8(src0, srcA2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA2, src0, 16); + VXC_DP2x8(src1, srcA3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA3, src1, 16); + + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB0, src0, 16); + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB1, src1, 16); + VXC_DP2x8(src0, srcB2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB2, src0, 16); + VXC_DP2x8(src1, srcB3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB3, src1, 16); + + sum0 += (float4)(dot(tempA0, tempB0), dot(tempA0, tempB1), dot(tempA0, tempB2), dot(tempA0, tempB3)); + sum1 += (float4)(dot(tempA1, tempB0), dot(tempA1, tempB1), dot(tempA1, tempB2), dot(tempA1, tempB3)); + sum2 += (float4)(dot(tempA2, tempB0), dot(tempA2, tempB1), dot(tempA2, tempB2), dot(tempA2, tempB3)); + sum3 += (float4)(dot(tempA3, tempB0), dot(tempA3, tempB1), dot(tempA3, tempB2), dot(tempA3, tempB3)); + } + + vxc_ushort8 valC0, valC1, valDst; + _viv_asm(COPY, valC0, sum0, 16); + _viv_asm(COPY, 
valC1, sum1, 16); + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + _viv_asm(COPY, valC0, sum2, 16); + _viv_asm(COPY, valC1, sum3, 16); + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx index 9d2ef89..bd211d4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx @@ -11,6 +11,9 @@ _viv_uniform int ac2zero; _viv_uniform int bc2zero; _viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #if (VX_VERSION==2) __kernel void gemm_F16F16toF16(image2d_array_t inputA, @@ -192,14 +195,9 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, } #endif -__kernel void gemm_F32F32toF32(image2d_array_t inputA, - image2d_array_t inputB, - image2d_array_t output, - int transposeA, - int transposeB, - int adjointA, - int adjointB, - uint M, uint K, uint N) +__kernel void gemm_F32F32toF32( + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) { uint gidx = get_global_id(0); uint gidy = get_global_id(1); @@ -207,10 +205,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA, int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(gidx, 0, (bc2zero ? 
0 : get_global_id(2)), 0); - vxc_float4 sum0 = (vxc_float4)(0); - vxc_float4 sum1 = (vxc_float4)(0); - vxc_float4 sum2 = (vxc_float4)(0); - vxc_float4 sum3 = (vxc_float4)(0); + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); vxc_int4 tmpOut0, tmpOut1; vxc_uchar16 outC; @@ -224,7 +220,6 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA, coord_a.x = i; coord_a.y = gidy; - coord_b.x = gidx; coord_b.y = i; @@ -257,4 +252,4 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA, write_imagef(output, coord_b, sum2); coord_b.y++; write_imagef(output, coord_b, sum3); -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx index c40c720..cb7c067 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx @@ -222,6 +222,62 @@ __kernel void maximum_U8U8toU8_2D VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } +__kernel void maximum_U8U8toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_U8U8toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar16 src0, src1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8; _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8; __kernel void maximum_I16I16toI16 diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx index 15ab020..aab5d72 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx @@ -170,4 +170,64 @@ __kernel void maximum_F16F16toI16_2D tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); 
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void maximum_I16I16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I16I16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx index 4bfe529..0b3ef97 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx @@ -224,6 +224,62 @@ __kernel void minimum_U8U8toU8_2D VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } +__kernel void minimum_U8U8toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_U8U8toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar16 src0, src1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8; _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8; __kernel void minimum_I16I16toI16 diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx index a314ca9..c2f5ca5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx @@ -173,5 +173,65 @@ __kernel void minimum_F16F16toI16_2D tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void minimum_I16I16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I16I16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1; + VXC_ReadImage(src0, input0, coord.xy, 
VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx index 3d8dd53..2652b0f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx @@ -17,6 +17,9 @@ _viv_uniform float e2InScale; _viv_uniform float rowSumScale; _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \ __kernel void moments_axis0_##src0_type_name##toF16( \ @@ -262,6 +265,88 @@ __kernel void moments_axis0_I16toF16_2D( VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis0_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(0, gidy, gidz, 0); + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_float4 mean_vari0 = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + mean_vari0.x += dot(vec0, one) + dot(vec1, one); + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1); + } + + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, gidz); + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari0, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis0_BF16toBF16_2D( + image2d_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int2 coord = (int2)(0, gidy); + 
vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_float4 mean_vari0 = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + mean_vari0.x += dot(vec0, one) + dot(vec1, one); + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1); + } + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, 0); + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari0, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx index 6afb0a5..617719e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx @@ -18,6 +18,9 @@ _viv_uniform float e2InScale; _viv_uniform float rowSumScale; _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \ @@ -236,4 +239,79 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I1 VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, + int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int4 coord = (int4)(gidx, 0, 0, 0); + vxc_float4 sumsqr; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + float tmpSum = 0; + float tmpSqr = 0; + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + tmpSum 
+= dot(vec0, one) + dot(vec1, one); + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1); + } + } + } + lcl_sum[lidx] = tmpSum; + lcl_sqr[lidx] = tmpSqr; + + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = (float)(0); + float sqr = (float)(0); + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean_vari; + mean_vari.x = sum * dimRatio; + mean_vari.y = sqr * dimRatio; + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x; + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx index 0be50bf..d303ed9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx @@ -10,6 +10,8 @@ _viv_uniform float e2InScale; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \ __kernel void moments_axis1_##src0_type_name##toF16( \ @@ -197,3 +199,85 @@ __kernel void moments_axis1_F16toF16_2D( VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); } + +__kernel void moments_axis1_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_ushort8 src0; + vxc_ushort8 val; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + + sum += vec0; + sqr += (vec0 * vec0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, gidz); + vxc_short8 tmpdst0, tmpdst1, dst; + _viv_asm(COPY, tmpdst0, mean, 16); + _viv_asm(COPY, tmpdst1, vari, 16); + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis1_BF16toBF16_2D( + image2d_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, 0); + 
vxc_ushort8 src0; + vxc_ushort8 val; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + + sum += vec0; + sqr += (vec0 * vec0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, 0); + vxc_short8 tmpdst0, tmpdst1, dst; + _viv_asm(COPY, tmpdst0, mean, 16); + _viv_asm(COPY, tmpdst1, vari, 16); + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx index c47c34f..ce473c0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx @@ -9,6 +9,8 @@ _viv_uniform float e2InScale; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \ __kernel void moments_axis2_##src0_type_name##toF16( \ @@ -95,6 +97,50 @@ __kernel void moments_axis2_F16toF16( _viv_asm(CONV, tmpVari, vari); VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis2_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, + int axis_num) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx, gidy, 0, 0); + vxc_ushort8 src0; + vxc_ushort8 val; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + + sum += vec0; + sqr += (vec0 * vec0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, gidy); + + vxc_short8 tmpdst0, tmpdst1, dst; + _viv_asm(COPY, tmpdst0, mean, 16); + _viv_asm(COPY, tmpdst1, vari, 16); + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx index b456ee6..073c237 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx @@ -15,6 +15,9 @@ _viv_uniform float rowSumScale; _viv_uniform float4 output_ZP; _viv_uniform float4 outputScale; _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS012_QINT_U8(src0_type_name, read0_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toU8( \ @@ -72,4 +75,141 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_## VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ } \ } -MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16) \ No newline at end of file +MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + float tmpSum = 0; + float tmpSqr = 0; + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + tmpSum += dot(vec0, one) + dot(vec1, one); + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1); + } + } + + lcl_sum[lidx] = tmpSum; + lcl_sqr[lidx] = tmpSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0.0f; + float sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean_vari; + mean_vari.x = sum * dimRatio; + mean_vari.y = sqr * dimRatio; + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x; + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16_2D( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int2 
coord = (int2)(gidx, 0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + float tmpSum = 0; + float tmpSqr = 0; + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + tmpSum += dot(vec0, one) + dot(vec1, one); + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1); + } + } + + lcl_sum[lidx] = tmpSum; + lcl_sqr[lidx] = tmpSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0.0f; + float sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 mean_vari; + mean_vari.x = sum * dimRatio; + mean_vari.y = sqr * dimRatio; + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x; + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx index 6d3cd52..eb248fb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx @@ -203,3 +203,91 @@ __kernel void one_hot_##name0##to##name1##_2D \ ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8) ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +__kernel void one_hot_BF16toBF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int suffix_sz, + int on_val, + int off_val + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); + + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + int4 data0 = convert_int4(vec0); + int4 data1 = convert_int4(vec1); + + do + { + int4 d0 = data0 == coord.zzzz ? on_val : off_val; + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; + + vxc_short8 dst; + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_BF16toBF16_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int suffix_sz, + int on_val, + int off_val + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + int4 data = convert_int4(vec0); + int4 data0, data1; + int4 d4 = (int4)(0, 1, 2, 3); + do + { + coord.zw = coord.xx + (int2)(0, 1); + vxc_short8 dst; + data0 = data.xxxx == d4 ? on_val : off_val; + data1 = data.yyyy == d4 ? on_val : off_val; + + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); + coord.zw = coord.zw + (int2)(2, 2); + + data0 = data.zzzz == d4 ? on_val : off_val; + data1 = data.wwww == d4 ? on_val : off_val; + + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); + d4 += 4; + coord.y += 4; + } while (coord.y < depth); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx index 64018e7..752813e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx @@ -7,13 +7,15 @@ _viv_uniform float input1Tail; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; #define COMPARISONS_2D(func_name, src0_type_name, src1_type_name, \ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __write_only image2d_array_t output \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -112,3 +114,42 @@ COMPARISONS_2D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_2D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=) COMPARISONS_2D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=) +#define COMPARISONS_BF_2D(func_name, src0_type_name, src1_type_name, cmp_op) \ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 
get_global_id(1)); \ + vxc_ushort8 src0, src1, srcA, srcB; \ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA0, vecA1; \ + float4 vecB0, vecB1; \ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA0, src0, 16); \ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecA1, src1, 16); \ + VXC_DP2x8(src0, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecB0, src0, 16); \ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB1, src1, 16); \ + int4 dst0, dst1; \ + dst0 = (vecA0)cmp_op(vecB0); \ + dst1 = (vecA1)cmp_op(vecB1); \ + \ + vxc_char16 dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst &= 1; \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} + +COMPARISONS_BF_2D(less, BF16, BF16, <) +COMPARISONS_BF_2D(great, BF16, BF16, >) +COMPARISONS_BF_2D(less_equal, BF16, BF16, <=) +COMPARISONS_BF_2D(great_equal, BF16, BF16, >=) +COMPARISONS_BF_2D(equal, BF16, BF16, ==) +COMPARISONS_BF_2D(not_equal, BF16, BF16, !=) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx index 0fcc274..f24a924 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx @@ -7,6 +7,8 @@ _viv_uniform float input1Tail; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; #define COMPARISONS_3D(func_name, src0_type_name, src1_type_name, \ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \ @@ -112,3 +114,42 @@ COMPARISONS_3D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_3D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=) COMPARISONS_3D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=) +#define COMPARISONS_BF_3D(func_name, src0_type_name, src1_type_name, cmp_op) \ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + vxc_ushort8 src0, src1, srcA, srcB; \ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA0, vecA1; \ + float4 vecB0, vecB1; \ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA0, src0, 16); \ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecA1, src1, 16); \ + VXC_DP2x8(src0, srcB, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecB0, src0, 16); \ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB1, src1, 16); \ + int4 dst0, dst1; \ + dst0 = (vecA0)cmp_op(vecB0); \ + dst1 = (vecA1)cmp_op(vecB1); \ + \ + vxc_char16 dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst &= 1; \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} + +COMPARISONS_BF_3D(less, BF16, BF16, <) +COMPARISONS_BF_3D(great, BF16, BF16, >) +COMPARISONS_BF_3D(less_equal, BF16, BF16, <=) +COMPARISONS_BF_3D(great_equal, BF16, BF16, >=) +COMPARISONS_BF_3D(equal, BF16, BF16, ==) +COMPARISONS_BF_3D(not_equal, BF16, BF16, !=) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_1.vx similarity index 100% rename from src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_1.vx diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx new file mode 100644 index 0000000..ecf26d6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx @@ -0,0 +1,129 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8; +__kernel void resize_bilinear_U8toU8_SAME_8x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 7) >> 4; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, in2, tmp, dst0, dst1, dst2, dst3; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + while (coord_out.y < out_height) + { + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniResize8xUp_l00_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx index e02967d..994aadd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx @@ -7,10 +7,14 @@ _viv_uniform int offsetX; _viv_uniform int offsetY; _viv_uniform int offsetZ; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + __kernel void scatter_nd_F16toF16( __read_only image2d_t input0, __read_only image2d_t input1, - image2d_array_t output, + image2d_t output, int width, int area, int coord_dim @@ -38,11 +42,53 @@ __kernel void scatter_nd_F16toF16( VXC_WriteImage(output, (int2)(gidx, gidy), tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } 
+__kernel void scatter_nd_BF16toBF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + image2d_t output, + int width, + int area, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0); + float4 sum1 = (float4)(0); + vxc_ushort8 tmpVal; + for(int i = 0; i < index_num; i++) + { + int4 indice = read_imagei(input0, (int2)(0, i)); + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ; + if(gidy == idx) + { + vxc_ushort8 src0, src1; + VXC_ReadImage(tmpVal, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 vec0, vec1; + VXC_DP2x8(src0, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + sum0 += vec0; + sum1 += vec1; + } + } + vxc_ushort8 dst0, dst1, dst; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, (int2)(gidx, gidy), dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + #define SCATTER_ND_QINT(src0_type_name, data_type) \ __kernel void scatter_nd_##src0_type_name##to##src0_type_name##( \ __read_only image2d_t input0, \ __read_only image2d_t input1, \ - image2d_array_t output, \ + image2d_t output, \ int width, \ int area, \ int coord_dim \ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 324dade..93da98e 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -875,6 +875,8 @@ static const char argmax_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform int4 packedArgIdx;\n\ _viv_uniform int argLenSub1;\n\ _viv_uniform VXC_512Bits uniExtractData_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract1stU8toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract2ndU8toI16_2x8;\n\ \n\ #define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\\\n\ src_type, copy_type, axis_type, dst_type, inst_type) \\\n\ @@ -939,6 +941,56 @@ TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_ #define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \\\n\ __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \\\n\ __read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + maxVal = max(maxVal, src); \\\n\ + src_type condition; \\\n\ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char16, vxc_uchar16)\n\ +TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_MIX(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axisVal \\\n\ ) \\\n\ @@ -967,26 +1019,49 @@ __write_only image2d_array_t output, \\\n\ \\\n\ VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_MIX(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_MIX(U8, I16, vxc_uchar8, vxc_short8)\n\ \n\ -#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ - __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +#define TENSOR_ARGMAX_AXIS2_MIX_OPT(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_opt( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axisVal \\\n\ ) \\\n\ { \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ - VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_uchar16 axis; \\\n\ + vxc_uchar16 packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + maxVal = max(maxVal, src); \\\n\ + src_type condition; \\\n\ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + vxc_short8 dst0, dst1; \\\n\ + VXC_DP2x8(dst0, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtract1stU8toI16_2x8); \\\n\ + VXC_DP2x8(dst1, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtract2ndU8toI16_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + VXC_WriteImage(output, coord.xy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_MIX_OPT(I8, I16, vxc_char16, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_MIX_OPT(U8, I16, vxc_uchar16, vxc_short8)\n\ "; /* end of argmax_axis2_vx*/ static const char argmin_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -2835,10 +2910,10 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ vxc_short8 w_zp = (short)weight_ZP;\n\ vxc_uchar16 input_val = 0, weight_val = 0;\n\ int temp = 0, i, j;\n\ - Tensor src_tensor = create_image_from_image2d(input, 1);\n\ - uchar *src_ptr_base = (uchar *)src_image.ptr;\n\ + Tensor src_tensor = create_tensor_from_image2d_array(input, 1);\n\ + uchar *src_ptr_base = (uchar *)src_tensor.ptr;\n\ uchar *src_ptr;\n\ - Tensor dst_tensor = create_image_from_image2d(output, 1);\n\ + Tensor dst_tensor = create_tensor_from_image2d_array(output, 1);\n\ uchar *dst_ptr = (uchar *)dst_tensor.ptr;\n\ \n\ temp = read_imagei(bias, coord.yz).x;\n\ @@ -2847,7 +2922,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ for (i = 0; i < input_height; i++)\n\ {\n\ - src_ptr = src_ptr_base + (coord.x + coord.z * src_image.stride_y);\n\ + src_ptr = src_ptr_base + (coord.x + coord.z * src_tensor.stride_y);\n\ for (j = 0; j < kernel_cnt_x16; j++)\n\ {\n\ VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ @@ -2892,6 +2967,830 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ "; /* end of conv1d_ovxlib_k1024_vx*/ +static const char custom_softmax_vx[] = "/*\n\ + ============================================================================\n\ + Name : Softmax2.vx\n\ + Author : VSI\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ +_viv_uniform int sf_size;\n\ + #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ +__kernel void Softmax2VXC\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(0,0,0,0);\n\ + float fMax = 0.0;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + fMax = F_MAX(fMax, fval);\n\ + }\n\ +\n\ + float fProbSum = 0.0f;\n\ + vxc_short8 dst;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ +\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), 
Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut = (float)exp(fval - fMax);\n\ + fProbSum += fOut;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + _viv_asm(COPY, val_h,val, 16);\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut =fval/fProbSum;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of custom_softmax_vx*/ + +static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float2 matrix1;\n\ +_viv_uniform float4 matrix4;\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, 
coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, 
VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + 
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 
0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_warp_affine_vx*/ + +static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float4 matrix1;\n\ +_viv_uniform float4 matrix2;\n\ +_viv_uniform float4 matrix4;\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only 
image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(floor(coord_f));\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ 
+#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, 
coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\ + VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 7, 7)\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_perspective_bilinear_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + 
__write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(floor(coord_f));\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + 
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_warp_perspective_vx*/ + static const char depth2space_crd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ @@ -3198,7 +4097,8 @@ __kernel void depth2space_crd_F16toI16_blk2(\n\ VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_out.x += 8;\n\ VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of depth2space_crd_vx*/ +}\n\ +"; /* end of depth2space_crd_vx*/ static const char depthwise_conv1d_src0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -4047,6 +4947,11 @@ float4 eltwise_unary_sin(float4 x)\n\ return native_sin(x);\n\ }\n\ \n\ +float4 eltwise_unary_cos(float4 x)\n\ +{\n\ + return native_cos(x);\n\ +}\n\ +\n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ float4 eltwise_unary_exp(float4 x)\n\ @@ -4228,6 +5133,17 @@ ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//COS\n\ +ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, 
vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ //LOG\n\ ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ @@ -4354,6 +5270,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ ELTSISE_UNARY_BF16_2D(exp)\n\ //SIN\n\ ELTSISE_UNARY_BF16_2D(sin)\n\ +//COS\n\ +ELTSISE_UNARY_BF16_2D(cos)\n\ //LOG\n\ ELTSISE_UNARY_BF16_2D(log)\n\ //ELU\n\ @@ -4382,6 +5300,11 @@ float4 eltwise_unary_sin(float4 x)\n\ return native_sin(x);\n\ }\n\ \n\ +float4 eltwise_unary_cos(float4 x)\n\ +{\n\ + return native_cos(x);\n\ +}\n\ +\n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ float4 eltwise_unary_exp(float4 x)\n\ @@ -4563,6 +5486,17 @@ ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//COS\n\ +ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ //LOG\n\ ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ @@ -4688,6 +5622,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ ELTSISE_UNARY_BF16(exp)\n\ //SIN\n\ ELTSISE_UNARY_BF16(sin)\n\ +//COS\n\ +ELTSISE_UNARY_BF16(cos)\n\ //LOG\n\ ELTSISE_UNARY_BF16(log)\n\ //ELU\n\ @@ -5216,8 +6152,6 @@ __kernel void gather_F16toF16(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ -\n\ -\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ @@ -5491,6 +6425,245 @@ GATHER_AXIS0_ARRAY(I8, vxc_char16, char, vxc_char4)\n\ GATHER_AXIS0_ARRAY(I16, vxc_short8, short, vxc_short4)\n\ GATHER_AXIS0_ARRAY(F16, vxc_short8, short, vxc_short4)"; /* end of gather_array_vx*/ +static const char gather_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;\n\ +_viv_uniform int batch;\n\ +\n\ +__kernel void gather_batch_I8toI8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int 
axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_I16toI16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_I8toI8_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char16 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_batch_U8toU8_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_batch_I16toI16_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, 
dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_batch_F16toF16_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_batch_vx*/ + static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int indices_num;\n\ @@ -5698,6 +6871,244 @@ __kernel void gather_I16toF16_axis0(\n\ }\n\ "; /* end of gather_mix_vx*/ +static const char gather_mix_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +_viv_uniform int batch;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define GATHER_BATCH_8BITS_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void gather_batch_##src0_type_name##toF16( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int2 coord_idx = (int2)(gidy, 0); \\\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + \\\n\ + for(; coord_idx.y < batch;) \\\n\ + { \\\n\ + int4 indice = read_imagei(input1, coord_idx); \\\n\ + coord_idx.y++; \\\n\ + coord_in.y = gidz * axis_num + indice.x; \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z++; \\\n\ + vxc_half8 src0, src1; \\\n\ + vxc_short8 dst0, dst1; \\\n\ + \\\n\ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + _viv_asm(COPY, dst1, src1, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + VXC_WriteImage2DArray(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + coord.x = gidx; \\\n\ + } \\\n\ +}\n\ 
+GATHER_BATCH_8BITS_TO_F16(U8, vxc_uchar16)\n\ +GATHER_BATCH_8BITS_TO_F16(I8, vxc_char16)\n\ +\n\ +#define GATHER_BATCH_F16_TO_QINT(src1_type_name, write_type) \\\n\ +__kernel void gather_batch_F16to##src1_type_name( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int2 coord_idx = (int2)(gidy, 0); \\\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + for(; coord_idx.y < batch;) \\\n\ + { \\\n\ + int4 indice = read_imagei(input1, coord_idx); \\\n\ + coord_idx.y++; \\\n\ + coord_in.y = gidz * axis_num + indice.x; \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z++; \\\n\ + \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + } \\\n\ +}\n\ +GATHER_BATCH_F16_TO_QINT(U8, vxc_uchar16)\n\ +GATHER_BATCH_F16_TO_QINT(I8, vxc_char16)\n\ +GATHER_BATCH_F16_TO_QINT(I16, vxc_short8)\n\ +\n\ +__kernel void gather_batch_I16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +#define GATHER_BATCH_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \\\n\ +__kernel void gather_batch_##src0_type_name##toF16_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 indices = read_imagei(input1, coord.xz); \\\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.y; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + coord_in.x = indices.z; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.w; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_BATCH_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)\n\ +GATHER_BATCH_8BITS_TO_F16_AXIS0(I8, vxc_char16)\n\ +\n\ +#define GATHER_BATCH_F16_TO_QINT_AXIS0(src1_type_name, write_type) \\\n\ +__kernel void gather_batch_F16to##src1_type_name##_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 indices = read_imagei(input1, coord.xz); \\\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.y; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.z; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.w; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_BATCH_F16_TO_QINT_AXIS0(U8, vxc_uchar16)\n\ +GATHER_BATCH_F16_TO_QINT_AXIS0(I8, vxc_char16)\n\ +GATHER_BATCH_F16_TO_QINT_AXIS0(I16, vxc_short8)\n\ +\n\ +__kernel void gather_batch_I16toF16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + 
uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_mix_batch_vx*/ + static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void gather_nd_I8toI8_1D(\n\ @@ -12425,7 +13836,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ - Image src_img = create_image_from_image2d(input, 1);\n\ + Image src_img = create_image_from_image2d(input, 1); \\\n\ uchar *src_ptr_base = (uchar *)src_img.ptr; \\\n\ uchar *src_ptr; \\\n\ vxc_uchar8 src0, src1; \\\n\ @@ -16772,6 +18183,8 @@ __kernel void logical_not_I8toI8_2D(\n\ static const char logical_ops_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform VXC_512Bits uniConvertInt16toInt8_2x8;\n\ +\n\ #define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\\\n\ lgc_op, lgc_op2, read_fun, write_fun) \\\n\ input_type vA;\\\n\ @@ -16831,7 +18244,7 @@ out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \\\n\ VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \\\n\ out_copy_type data; \\\n\ _viv_asm(COPY, data, tmpOut, 16); \\\n\ - write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ \n\ #define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\\\n\ @@ -16858,6 +18271,47 @@ copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ VXC_ReadImage, VXC_WriteImage) \\\n\ }\n\ \n\ +#define TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type,\\\n\ +out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \\\n\ + input_type vA;\\\n\ + copy_type src0;\\\n\ + input_type vB;\\\n\ + copy_type src1;\\\n\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src0, vA, 16); \\\n\ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src1, vB, 16); \\\n\ + output_type dst; \\\n\ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \\\n\ + vxc_char8 data; \\\n\ + VXC_DP2x8(data,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 1),uniConvertInt16toInt8_2x8); \\\n\ + data &= 1; \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define TENSORLOGICAL_BFP16(name0, src_type_name, dst_type_name, input_type,\\\n\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\\\n\ + TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\\\n\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +#define TENSORLOGICAL_BFP16_2D(name0, src_type_name, dst_type_name, input_type,\\\n\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ + 
TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\\\n\ + VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +\n\ // name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2\n\ TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )\n\ //TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )\n\ @@ -16872,6 +18326,10 @@ TENSORLOGICAL(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vx //TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ //TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ \n\ +TENSORLOGICAL_BFP16(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +TENSORLOGICAL_BFP16(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +TENSORLOGICAL_BFP16(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ +\n\ TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )\n\ //TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )\n\ //TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ @@ -16884,6 +18342,10 @@ TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, //TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!)\n\ //TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ //TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ +\n\ +TENSORLOGICAL_BFP16_2D(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +TENSORLOGICAL_BFP16_2D(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +TENSORLOGICAL_BFP16_2D(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ "; /* end of logical_ops_vx*/ static const char lstmunit_activation_BP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -20706,6 +22168,280 @@ LSTMUNIT_S_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoi LSTMUNIT_S_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_S_U8_vx*/ +static const char matrixmul_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +__kernel void gemm_BF16BF16toBF16(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB,\n\ + int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + vxc_ushort8 valC0, valC1, src0, src1;\n\ + vxc_ushort8 srcA0, srcB0, srcA1, srcB1, outC;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4;\n\ + coord_b.y += 4;\n\ +\n\ + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA0, src0, 16);\n\ + VXC_DP2x8(src1, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempA1, src1, 16);\n\ + VXC_DP2x8(src0, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA2, src0, 16);\n\ + VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempA3, src1, 16);\n\ + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB0, src0, 16);\n\ + VXC_DP2x8(src1, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempB1, src1, 16);\n\ + VXC_DP2x8(src0, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB2, src0, 16);\n\ + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempB3, src1, 16);\n\ +\n\ + sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3);\n\ + sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3);\n\ + sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3);\n\ + sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);\n\ + }\n\ + 
coord_b.y = gidy;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(COPY, valC0, sum0, 16);\n\ + _viv_asm(COPY, valC1, sum1, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(COPY, valC0, sum2, 16);\n\ + _viv_asm(COPY, valC1, sum3, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gemm_transa_BF16BF16toBF16(\n\ + image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ +\n\ + vxc_ushort8 valC0, valC1;\n\ + vxc_ushort8 srcA, srcB, outC, src0, src1;\n\ +\n\ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + vxc_float4 tempA0;\n\ + vxc_float4 tempB0;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;)\n\ + {\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.y++;\n\ + coord_b.y++;\n\ +\n\ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA0, src0, 16);\n\ +\n\ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB0, src1, 16);\n\ +\n\ + sum0 = (sum0 + tempA0.x * tempB0);\n\ + sum1 = (sum1 + tempA0.y * tempB0);\n\ + sum2 = (sum2 + tempA0.z * tempB0);\n\ + sum3 = (sum3 + tempA0.w * tempB0);\n\ + }\n\ + coord_b.y = gidy;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(COPY, valC0, sum0, 16);\n\ + _viv_asm(COPY, valC1, sum1, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + _viv_asm(COPY, valC0, sum2, 16);\n\ + _viv_asm(COPY, valC1, sum3, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_ushort8 src0, src1;\n\ +\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_ushort8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_ushort8 srcB0,srcB1,srcB2,srcB3;\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4;\n\ + coord_b.x += 4;\n\ +\n\ + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA0, src0, 16);\n\ + VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA1, src1, 16);\n\ + VXC_DP2x8(src0, 
srcA2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA2, src0, 16);\n\ + VXC_DP2x8(src1, srcA3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA3, src1, 16);\n\ +\n\ + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB0, src0, 16);\n\ + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB1, src1, 16);\n\ + VXC_DP2x8(src0, srcB2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB2, src0, 16);\n\ + VXC_DP2x8(src1, srcB3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB3, src1, 16);\n\ +\n\ + sum0 += (float4)(dot(tempA0, tempB0), dot(tempA0, tempB1), dot(tempA0, tempB2), dot(tempA0, tempB3));\n\ + sum1 += (float4)(dot(tempA1, tempB0), dot(tempA1, tempB1), dot(tempA1, tempB2), dot(tempA1, tempB3));\n\ + sum2 += (float4)(dot(tempA2, tempB0), dot(tempA2, tempB1), dot(tempA2, tempB2), dot(tempA2, tempB3));\n\ + sum3 += (float4)(dot(tempA3, tempB0), dot(tempA3, tempB1), dot(tempA3, tempB2), dot(tempA3, tempB3));\n\ + }\n\ +\n\ + vxc_ushort8 valC0, valC1, valDst;\n\ + _viv_asm(COPY, valC0, sum0, 16);\n\ + _viv_asm(COPY, valC1, sum1, 16);\n\ + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + _viv_asm(COPY, valC0, sum2, 16);\n\ + _viv_asm(COPY, valC1, sum3, 16);\n\ + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of matrixmul_bf16_vx*/ + static const char matrixmul_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ @@ -20719,6 +22455,9 @@ _viv_uniform int ac2zero;\n\ _viv_uniform int bc2zero;\n\ \n\ _viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #if (VX_VERSION==2)\n\ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ @@ -20900,14 +22639,9 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ }\n\ #endif\n\ \n\ -__kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ - image2d_array_t inputB,\n\ - image2d_array_t output,\n\ - int transposeA,\n\ - int transposeB,\n\ - int adjointA,\n\ - int adjointB,\n\ - uint M, uint K, uint N)\n\ +__kernel void gemm_F32F32toF32(\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\ {\n\ uint gidx = get_global_id(0);\n\ uint gidy = get_global_id(1);\n\ @@ -20915,10 +22649,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ int4 coord_a = (int4)(0, gidy, (ac2zero ? 
0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(gidx, 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ - vxc_float4 sum0 = (vxc_float4)(0);\n\ - vxc_float4 sum1 = (vxc_float4)(0);\n\ - vxc_float4 sum2 = (vxc_float4)(0);\n\ - vxc_float4 sum3 = (vxc_float4)(0);\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ \n\ vxc_int4 tmpOut0, tmpOut1;\n\ vxc_uchar16 outC;\n\ @@ -20932,7 +22664,6 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ \n\ coord_a.x = i;\n\ coord_a.y = gidy;\n\ -\n\ coord_b.x = gidx;\n\ coord_b.y = i;\n\ \n\ @@ -20965,7 +22696,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ write_imagef(output, coord_b, sum2);\n\ coord_b.y++;\n\ write_imagef(output, coord_b, sum3);\n\ -}"; /* end of matrixmul_f16_vx*/ +}\n\ +"; /* end of matrixmul_f16_vx*/ static const char matrixmul_f16f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -23660,6 +25392,62 @@ __kernel void maximum_U8U8toU8_2D\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void maximum_U8U8toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_U8U8toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\ __kernel void maximum_I16I16toI16\n\ @@ -24198,6 +25986,66 @@ __kernel void maximum_F16F16toI16_2D\n\ tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void maximum_I16I16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I16I16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }"; /* end of maximum_i16_vx*/ static const char minimum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -24426,6 +26274,62 @@ __kernel void minimum_U8U8toU8_2D\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void minimum_U8U8toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_U8U8toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\ __kernel void minimum_I16I16toI16\n\ @@ -24968,6 +26872,66 @@ __kernel void minimum_F16F16toI16_2D\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ \n\ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void minimum_I16I16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I16I16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }"; /* end of minimum_i16_vx*/ static const char moments_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -24989,6 +26953,9 @@ _viv_uniform float e2InScale;\n\ _viv_uniform float rowSumScale;\n\ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \\\n\ __kernel void moments_axis0_##src0_type_name##toF16( \\\n\ @@ -25236,6 +27203,88 @@ __kernel void moments_axis0_I16toF16_2D(\n\ \n\ VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis0_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(0, gidy, gidz, 0);\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_float4 mean_vari0 = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + mean_vari0.x += dot(vec0, one) + dot(vec1, one);\n\ + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ +\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari0, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis0_BF16toBF16_2D(\n\ + image2d_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int2 coord = (int2)(0, gidy);\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_float4 mean_vari0 = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 
0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + mean_vari0.x += dot(vec0, one) + dot(vec1, one);\n\ + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, 0);\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari0, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of moments_axis0_vx*/ static const char moments_axis01_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -25683,6 +27732,9 @@ _viv_uniform float e2InScale;\n\ _viv_uniform float rowSumScale;\n\ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \\\n\ @@ -25901,6 +27953,81 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I1 VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis,\n\ + int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + vxc_float4 sumsqr;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + float tmpSum = 0;\n\ + float tmpSqr = 0;\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + tmpSum += dot(vec0, one) + dot(vec1, one);\n\ + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + 
}\n\ + }\n\ + lcl_sum[lidx] = tmpSum;\n\ + lcl_sqr[lidx] = tmpSqr;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = (float)(0);\n\ + float sqr = (float)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean_vari;\n\ + mean_vari.x = sum * dimRatio;\n\ + mean_vari.y = sqr * dimRatio;\n\ + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ }"; /* end of moments_axis012_vx*/ static const char moments_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -25915,6 +28042,8 @@ _viv_uniform float e2InScale;\n\ _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \\\n\ __kernel void moments_axis1_##src0_type_name##toF16( \\\n\ @@ -26102,6 +28231,88 @@ __kernel void moments_axis1_F16toF16_2D(\n\ VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel void moments_axis1_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_ushort8 src0;\n\ + vxc_ushort8 val;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ +\n\ + sum += vec0;\n\ + sqr += (vec0 * vec0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidz);\n\ + vxc_short8 tmpdst0, tmpdst1, dst;\n\ + _viv_asm(COPY, tmpdst0, mean, 16);\n\ + _viv_asm(COPY, tmpdst1, vari, 16);\n\ + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis1_BF16toBF16_2D(\n\ + image2d_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, 0);\n\ 
+ vxc_ushort8 src0;\n\ + vxc_ushort8 val;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ +\n\ + sum += vec0;\n\ + sqr += (vec0 * vec0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, 0);\n\ + vxc_short8 tmpdst0, tmpdst1, dst;\n\ + _viv_asm(COPY, tmpdst0, mean, 16);\n\ + _viv_asm(COPY, tmpdst1, vari, 16);\n\ + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ "; /* end of moments_axis1_vx*/ static const char moments_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -26115,6 +28326,8 @@ _viv_uniform float e2InScale;\n\ _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \\\n\ __kernel void moments_axis2_##src0_type_name##toF16( \\\n\ @@ -26203,6 +28416,50 @@ __kernel void moments_axis2_F16toF16(\n\ _viv_asm(COPY, dst, tmpVal, 16);\n\ VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis2_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis,\n\ + int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ + vxc_ushort8 src0;\n\ + vxc_ushort8 val;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ +\n\ + sum += vec0;\n\ + sqr += (vec0 * vec0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ +\n\ + vxc_short8 tmpdst0, tmpdst1, dst;\n\ + _viv_asm(COPY, tmpdst0, mean, 16);\n\ + _viv_asm(COPY, tmpdst1, vari, 16);\n\ + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of 
moments_axis2_vx*/ static const char moments_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -26570,6 +28827,9 @@ _viv_uniform float rowSumScale;\n\ _viv_uniform float4 output_ZP;\n\ _viv_uniform float4 outputScale;\n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS012_QINT_U8(src0_type_name, read0_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toU8( \\\n\ @@ -26627,7 +28887,144 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_## VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ -MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16)"; /* end of moments_u8_axis012_vx*/ +MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + float tmpSum = 0;\n\ + float tmpSqr = 0;\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + tmpSum += dot(vec0, one) + dot(vec1, one);\n\ + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSum;\n\ + lcl_sqr[lidx] = tmpSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0.0f;\n\ + float sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean_vari;\n\ + mean_vari.x = sum * dimRatio;\n\ + mean_vari.y = sqr * dimRatio;\n\ + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16_2D(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = 
get_local_id(0);\n\ + int2 coord = (int2)(gidx, 0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + float tmpSum = 0;\n\ + float tmpSqr = 0;\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ +\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + tmpSum += dot(vec0, one) + dot(vec1, one);\n\ + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSum;\n\ + lcl_sqr[lidx] = tmpSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0.0f;\n\ + float sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 mean_vari;\n\ + mean_vari.x = sum * dimRatio;\n\ + mean_vari.y = sqr * dimRatio;\n\ + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of moments_u8_axis012_vx*/ static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -26834,7 +29231,94 @@ __kernel void one_hot_##name0##to##name1##_2D \\\n\ ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\ ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ \n\ -"; /* end of one_hot_vx*/ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +__kernel void one_hot_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int suffix_sz,\n\ + int on_val,\n\ + int off_val\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0));\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + int4 data0 = convert_int4(vec0);\n\ + int4 data1 = convert_int4(vec1);\n\ +\n\ + do\n\ + {\n\ + int4 d0 = data0 == coord.zzzz ? on_val : off_val;\n\ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val;\n\ +\n\ + vxc_short8 dst;\n\ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int suffix_sz,\n\ + int on_val,\n\ + int off_val\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + int4 data = convert_int4(vec0);\n\ + int4 data0, data1;\n\ + int4 d4 = (int4)(0, 1, 2, 3);\n\ + do\n\ + {\n\ + coord.zw = coord.xx + (int2)(0, 1);\n\ + vxc_short8 dst;\n\ + data0 = data.xxxx == d4 ? on_val : off_val;\n\ + data1 = data.yyyy == d4 ? on_val : off_val;\n\ +\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.zw = coord.zw + (int2)(2, 2);\n\ +\n\ + data0 = data.zzzz == d4 ? on_val : off_val;\n\ + data1 = data.wwww == d4 ? on_val : off_val;\n\ +\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0));\n\ + d4 += 4;\n\ + coord.y += 4;\n\ + } while (coord.y < depth);\n\ +}"; /* end of one_hot_vx*/ static const char poolwithargmax_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -35039,13 +37523,15 @@ _viv_uniform float input1Tail;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ \n\ #define COMPARISONS_2D(func_name, src0_type_name, src1_type_name, \\\n\ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \\\n\ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \\\n\ - __read_only image2d_array_t input0, \\\n\ - __read_only image2d_array_t input1, \\\n\ - __write_only image2d_array_t output \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -35144,6 +37630,45 @@ COMPARISONS_2D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_2D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=)\n\ COMPARISONS_2D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=)\n\ \n\ +#define COMPARISONS_BF_2D(func_name, src0_type_name, src1_type_name, cmp_op) \\\n\ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output 
\\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_ushort8 src0, src1, srcA, srcB; \\\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA0, vecA1; \\\n\ + float4 vecB0, vecB1; \\\n\ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecA1, src1, 16); \\\n\ + VXC_DP2x8(src0, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecB0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB1, src1, 16); \\\n\ + int4 dst0, dst1; \\\n\ + dst0 = (vecA0)cmp_op(vecB0); \\\n\ + dst1 = (vecA1)cmp_op(vecB1); \\\n\ + \\\n\ + vxc_char16 dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst &= 1; \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +COMPARISONS_BF_2D(less, BF16, BF16, <)\n\ +COMPARISONS_BF_2D(great, BF16, BF16, >)\n\ +COMPARISONS_BF_2D(less_equal, BF16, BF16, <=)\n\ +COMPARISONS_BF_2D(great_equal, BF16, BF16, >=)\n\ +COMPARISONS_BF_2D(equal, BF16, BF16, ==)\n\ +COMPARISONS_BF_2D(not_equal, BF16, BF16, !=)\n\ "; /* end of relational_ops_2d_vx*/ static const char relational_ops_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -35155,6 +37680,8 @@ _viv_uniform float input1Tail;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ \n\ #define COMPARISONS_3D(func_name, src0_type_name, src1_type_name, \\\n\ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \\\n\ @@ -35260,6 +37787,45 @@ COMPARISONS_3D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_3D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=)\n\ COMPARISONS_3D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=)\n\ \n\ +#define COMPARISONS_BF_3D(func_name, src0_type_name, src1_type_name, cmp_op) \\\n\ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + vxc_ushort8 src0, src1, srcA, srcB; \\\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA0, vecA1; \\\n\ + float4 vecB0, vecB1; \\\n\ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecA1, src1, 16); \\\n\ + VXC_DP2x8(src0, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecB0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB1, src1, 16); \\\n\ + int4 dst0, dst1; \\\n\ + dst0 = (vecA0)cmp_op(vecB0); \\\n\ + dst1 = (vecA1)cmp_op(vecB1); \\\n\ + \\\n\ + vxc_char16 dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst &= 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +COMPARISONS_BF_3D(less, BF16, BF16, <)\n\ +COMPARISONS_BF_3D(great, BF16, BF16, >)\n\ +COMPARISONS_BF_3D(less_equal, BF16, BF16, <=)\n\ +COMPARISONS_BF_3D(great_equal, BF16, BF16, >=)\n\ +COMPARISONS_BF_3D(equal, BF16, BF16, ==)\n\ +COMPARISONS_BF_3D(not_equal, BF16, BF16, !=)\n\ "; /* end of relational_ops_3d_vx*/ static const char relu_keras_vx[] = "\n\ @@ -38763,7 +41329,7 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ }\n\ "; /* end of resize_bilinear_U8_vx*/ -static const char resize_bilinear_U8_half_pixel_centers_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char resize_bilinear_U8_half_pixel_centers_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniResize2xUp_0_4x8;\n\ _viv_uniform VXC_512Bits uniResize2xUp_1_4x8;\n\ @@ -38992,7 +41558,138 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of resize_bilinear_U8_half_pixel_centers_vx*/ +"; /* end of resize_bilinear_U8_half_pixel_centers_1_vx*/ + +static const char resize_bilinear_U8_half_pixel_centers_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;\n\ +__kernel void resize_bilinear_U8toU8_SAME_8x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 7) >> 4;\n\ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, in2, tmp, dst0, dst1, dst2, dst3;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_2_vx*/ static const char resize_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -39651,10 +42348,14 @@ _viv_uniform int offsetX;\n\ _viv_uniform int offsetY;\n\ _viv_uniform int offsetZ;\n\ \n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ __kernel void scatter_nd_F16toF16(\n\ __read_only image2d_t input0,\n\ __read_only 
image2d_t input1,\n\ - image2d_array_t output,\n\ + image2d_t output,\n\ int width,\n\ int area,\n\ int coord_dim\n\ @@ -39682,11 +42383,53 @@ __kernel void scatter_nd_F16toF16(\n\ VXC_WriteImage(output, (int2)(gidx, gidy), tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void scatter_nd_BF16toBF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + image2d_t output,\n\ + int width,\n\ + int area,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0);\n\ + float4 sum1 = (float4)(0);\n\ + vxc_ushort8 tmpVal;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = read_imagei(input0, (int2)(0, i));\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_ushort8 src0, src1;\n\ + VXC_ReadImage(tmpVal, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + sum0 += vec0;\n\ + sum1 += vec1;\n\ + }\n\ + }\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, (int2)(gidx, gidy), dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ #define SCATTER_ND_QINT(src0_type_name, data_type) \\\n\ __kernel void scatter_nd_##src0_type_name##to##src0_type_name##( \\\n\ __read_only image2d_t input0, \\\n\ __read_only image2d_t input1, \\\n\ - image2d_array_t output, \\\n\ + image2d_t output, \\\n\ int width, \\\n\ int area, \\\n\ int coord_dim \\\n\ @@ -42914,7 +45657,7 @@ static const char vsi_nn_kernel_header_vx[] = "/*\n\ Description :\n\ ============================================================================\n\ */\n\ -#include \"cl_viv_vx_ext.h\"\n\ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ typedef struct Image\n\ {\n\ @@ -44567,6 +47310,45 @@ CAST_TO_BOOL_FUN_2D(U32, uint4, read_imageui)\n\ \n\ "; /* end of cast_cl*/ +static const char clip_BF16_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +__kernel void clip_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + float4 src;\n\ + _viv_asm(COPY, src, src0, 16);\n\ + float4 dst0 = clamp(src, minData, maxData);\n\ + uint4 dst;\n\ + _viv_asm(COPY, dst, dst0, 16);\n\ + dst = dst >> 16;\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_BF16toBF16_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + float4 src;\n\ + _viv_asm(COPY, src, src0, 16);\n\ + float4 dst0 = clamp(src, minData, maxData);\n\ + uint4 dst;\n\ + _viv_asm(COPY, dst, dst0, 16);\n\ + dst = dst >> 
16;\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +"; /* end of clip_BF16_cl*/ + static const char clip_F32_cl[] = "__kernel void clip_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -44708,6 +47490,25 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char depth2space_crd_cl[] = "\n\ +__kernel void depth2space_crd_F32toF32(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int block_e2 = block_size * block_size;\n\ + ushort blk = (ushort)block_size;\n\ + int inx = (int)((ushort)gidx / blk);\n\ + int iny = (int)((ushort)gidy / blk);\n\ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2;\n\ + int4 coord_in = (int4)(inx, iny, inz, 0);\n\ + float4 data = read_imagef(input, coord_in);\n\ + write_imagef(output, coord_out, data);\n\ +}\n\ +"; /* end of depth2space_crd_cl*/ + static const char detect_post_box_cl[] = "float exp_(float x, float logE)\n\ {\n\ x *= logE;\n\ @@ -44919,6 +47720,11 @@ static const char eltwise_unary_cl[] = "float eltwise_unary_sin(float x, float a return native_sin(x);\n\ }\n\ \n\ +float eltwise_unary_cos(float x, float alpha, float beta)\n\ +{\n\ + return native_cos(x);\n\ +}\n\ +\n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ float eltwise_unary_exp(float x, float alpha, float beta)\n\ @@ -45051,6 +47857,7 @@ __kernel void func_name##_F32toF32 \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ ELTWISE_UNARY_F32(sin)\n\ +ELTWISE_UNARY_F32(cos)\n\ ELTWISE_UNARY_F32(exp)\n\ ELTWISE_UNARY_F32(log)\n\ ELTWISE_UNARY_F32(elu)\n\ @@ -45084,6 +47891,7 @@ __kernel void func_name##_F32toF32_2D \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ ELTWISE_UNARY_F32_2D(sin)\n\ +ELTWISE_UNARY_F32_2D(cos)\n\ ELTWISE_UNARY_F32_2D(exp)\n\ ELTWISE_UNARY_F32_2D(log)\n\ ELTWISE_UNARY_F32_2D(elu)\n\ @@ -45118,6 +47926,7 @@ __kernel void func_name##_U8toU8 \\\n\ write_imageui(output, coord, dst); \\\n\ }\n\ ELTWISE_UNARY_U8(sin)\n\ +ELTWISE_UNARY_U8(cos)\n\ ELTWISE_UNARY_U8(exp)\n\ ELTWISE_UNARY_U8(log)\n\ ELTWISE_UNARY_U8(elu)\n\ @@ -45152,6 +47961,7 @@ __kernel void func_name##_U8toU8_2D \\\n\ write_imageui(output, coord, dst); \\\n\ }\n\ ELTWISE_UNARY_U8_2D(sin)\n\ +ELTWISE_UNARY_U8_2D(cos)\n\ ELTWISE_UNARY_U8_2D(exp)\n\ ELTWISE_UNARY_U8_2D(log)\n\ ELTWISE_UNARY_U8_2D(elu)\n\ @@ -45319,10 +48129,18 @@ __kernel void func_name##_U8toU8_2D \\\n\ ELTWISE_UNARY_U8_2D(erf)\n\ "; /* end of erf_cl*/ -static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ +static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ - __write_only image2d_array_t output)\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ float4 src0;\n\ @@ -45333,10 +48151,18 @@ static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ write_imagef(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_F32F32toF32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output)\n\ +__kernel void 
floordiv_F32F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ float4 src0 = read_imagef(input, coord);\n\ @@ -45345,33 +48171,8 @@ __kernel void floordiv_F32F32toF32_2D(\n\ write_imagef(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_I32I32toI32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t input1,\n\ - __write_only image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 src0;\n\ - int4 src1;\n\ - READ_IMAGEI_2DARRAY(src0, input, coord);\n\ - READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));\n\ - write_imagei(output, coord, dst);\n\ -}\n\ -\n\ -__kernel void floordiv_I32I32toI32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - int4 src0 = read_imagei(input, coord);\n\ - int4 src1 = read_imagei(input1, coord);\n\ - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));\n\ - write_imagei(output, coord, dst);\n\ -}\n\ -\n\ -__kernel void floordiv_I32I32toU8(\n\ +__kernel void floordiv_I32I32toI32\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -45380,7 +48181,56 @@ __kernel void floordiv_I32I32toU8(\n\ float input1Scale,\n\ float input1Tail,\n\ float outputScale,\n\ - float outputTail )\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 src0;\n\ @@ -45394,16 +48244,18 @@ __kernel void floordiv_I32I32toU8(\n\ 
write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_I32I32toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ - float input0Scale,\n\ - float input0Tail,\n\ - float input1Scale,\n\ - float input1Tail,\n\ - float outputScale,\n\ - float outputTail )\n\ +__kernel void floordiv_I32I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 src0 = read_imagei(input, coord);\n\ @@ -45415,7 +48267,8 @@ __kernel void floordiv_I32I32toU8_2D(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8U8toU8(\n\ +__kernel void floordiv_U8U8toU8\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -45424,7 +48277,8 @@ __kernel void floordiv_U8U8toU8(\n\ float input1Scale,\n\ float input1Tail,\n\ float outputScale,\n\ - float outputTail )\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ uint4 src0, src1;\n\ @@ -45438,16 +48292,18 @@ __kernel void floordiv_U8U8toU8(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8U8toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ - float input0Scale,\n\ - float input0Tail,\n\ - float input1Scale,\n\ - float input1Tail,\n\ - float outputScale,\n\ - float outputTail )\n\ +__kernel void floordiv_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ uint4 src0 = read_imageui(input, coord);\n\ @@ -45460,7 +48316,8 @@ __kernel void floordiv_U8U8toU8_2D(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8I32toU8(\n\ +__kernel void floordiv_U8I32toU8\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -45469,7 +48326,8 @@ __kernel void floordiv_U8I32toU8(\n\ float input1Scale,\n\ float input1Tail,\n\ float outputScale,\n\ - float outputTail )\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ uint4 src0;\n\ @@ -45484,16 +48342,18 @@ __kernel void floordiv_U8I32toU8(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8I32toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ - float input0Scale,\n\ - float input0Tail,\n\ - float input1Scale,\n\ - float input1Tail,\n\ - float outputScale,\n\ - float outputTail )\n\ +__kernel void floordiv_U8I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ uint4 src0 = read_imageui(input, coord);\n\ @@ -45514,7 
+48374,8 @@ static const char gather_cl[] = "__kernel void gather_U8toU8(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45538,7 +48399,8 @@ __kernel void gather_F16toF16(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45562,7 +48424,8 @@ __kernel void gather_I32toI32(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45586,7 +48449,8 @@ __kernel void gather_F32toF32(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45604,6 +48468,131 @@ __kernel void gather_F32toF32(\n\ }\n\ "; /* end of gather_cl*/ +static const char gather_batch_cl[] = "__kernel void gather_batch_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + uint4 data = read_imageui(input0, coord_in);\n\ + coord_in.z++;\n\ + write_imageui(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord_in);\n\ + coord_in.z++;\n\ + write_imagef(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_I32toI32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + int4 data = read_imagei(input0, 
coord_in);\n\ + coord_in.z++;\n\ + write_imagei(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord_in);\n\ + coord_in.z++;\n\ + write_imagef(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +"; /* end of gather_batch_cl*/ + static const char gather_nd_cl[] = "__kernel void gather_nd_U8toU8_1D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -52908,6 +55897,48 @@ __kernel void moments_axis0_I32toF32(\n\ int2 coord_out = (int2)(gidy, gidz);\n\ write_imagef(output_mean, coord_out, mean);\n\ write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +\n\ +__kernel void moments_axis0_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(0, gidy, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + for(coord0.x = 0; coord0.x < width;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord0);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord0.x++;\n\ +\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis0_cl*/ static const char moments_axis01_cl[] = "__kernel void moments_axis01_U8toF32(\n\ @@ -53084,6 +56115,66 @@ __kernel void moments_axis01_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }\n\ }\n\ +\n\ +__kernel void moments_axis01_BF16toF32(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + float tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord.y++;\n\ +\n\ + tmpSum = tmpSum + data.x;\n\ + tmpSqr = tmpSqr + data.x * data.x;\n\ + }\n\ + sqr += tmpSqr;\n\ + sum += tmpSum;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ 
+ lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ "; /* end of moments_axis01_cl*/ static const char moments_axis012_cl[] = "__kernel void moments_axis012_U8toF32(\n\ @@ -53265,6 +56356,67 @@ __kernel void moments_axis012_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }\n\ }\n\ +\n\ +__kernel void moments_axis012_BF16toF32(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.z = 0; coord.z < chn; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + float tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord.y++;\n\ + tmpSum = tmpSum + data.x;\n\ + tmpSqr = tmpSqr + data.x * data.x;\n\ + }\n\ + sqr += tmpSqr;\n\ + sum += tmpSum;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ "; /* end of moments_axis012_cl*/ static const char moments_axis1_cl[] = "__kernel void moments_axis1_U8toF32(\n\ @@ -53378,6 +56530,47 @@ __kernel void moments_axis1_I32toF32(\n\ int2 coord_out = (int2)(gidx, gidz);\n\ write_imagef(output_mean, coord_out, mean);\n\ write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +\n\ +__kernel void moments_axis1_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + for(coord0.y = 0; coord0.y < height;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord0);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord0.y++;\n\ + sum = sum + 
data.x;\n\ + sqr = sqr + data.x * data.x;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis1_cl*/ static const char moments_axis2_cl[] = "__kernel void moments_axis2_U8toF32(\n\ @@ -53505,7 +56698,50 @@ __kernel void moments_axis2_I32toF32(\n\ int2 coord_out = (int2)(gidx, gidy);\n\ write_imagef(output_mean, coord_out, mean);\n\ write_imagef(output_vari, coord_out, vari);\n\ -}"; /* end of moments_axis2_cl*/ +}\n\ +\n\ +__kernel void moments_axis2_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, gidy, 0, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + for(coord0.z = 0; coord0.z < chn;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord0);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord0.z++;\n\ +\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +"; /* end of moments_axis2_cl*/ static const char one_hot_cl[] = "__kernel void one_hot_F32toF32\n\ (\n\ @@ -57468,6 +60704,259 @@ TILE_2D(F32, F32, float4, read_imagef, write_imagef)\n\ \n\ "; /* end of tile_cl*/ +static const char topk_cl[] = "#define TOPK_F32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toF32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + uint local_id = get_local_id(0); \\\n\ + uint work_group_size = get_local_size(0); \\\n\ + uint offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local float local_data[128]; \\\n\ + __local uint local_indices[128]; \\\n\ + \\\n\ + float left = read_imagef(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + float data = read_imagef(input, coord.zy).x; \\\n\ + float right = coord.z < width ? 
data : -2147483647.0f; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + uint left_idx = local_indices[left_id]; \\\n\ + uint right_idx = local_indices[right_id]; \\\n\ + \\\n\ + float left_elem = local_data[left_id]; \\\n\ + float right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + float4 dst; \\\n\ + dst.x = local_data[local_id]; \\\n\ + dst.y = local_data[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagef(output, coord.xy, dst.xxxx); \\\n\ + write_imagef(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +TOPK_F32(1 << 0, 0)\n\ +TOPK_F32(1 << 1, 1)\n\ +TOPK_F32(1 << 2, 2)\n\ +TOPK_F32(1 << 3, 3)\n\ +TOPK_F32(1 << 4, 4)\n\ +TOPK_F32(1 << 5, 5)\n\ +TOPK_F32(1 << 6, 6)\n\ +\n\ +#define TOPK_U32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + uint local_id = get_local_id(0); \\\n\ + uint work_group_size = get_local_size(0); \\\n\ + uint offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local uint local_data[128]; \\\n\ + __local uint local_indices[128]; \\\n\ + \\\n\ + uint left = read_imageui(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + uint data = read_imageui(input, coord.zy).x; \\\n\ + uint right = coord.z < width ? 
data : 0; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + uint left_idx = local_indices[left_id]; \\\n\ + uint right_idx = local_indices[right_id]; \\\n\ + \\\n\ + uint left_elem = local_data[left_id]; \\\n\ + uint right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + uint4 dst; \\\n\ + dst.x = local_data[local_id]; \\\n\ + dst.y = local_data[local_id + work_group_size]; \\\n\ + \\\n\ + write_imageui(output, coord.xy, dst.xxxx); \\\n\ + write_imageui(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +TOPK_U32(1 << 0, 0)\n\ +TOPK_U32(1 << 1, 1)\n\ +TOPK_U32(1 << 2, 2)\n\ +TOPK_U32(1 << 3, 3)\n\ +TOPK_U32(1 << 4, 4)\n\ +TOPK_U32(1 << 5, 5)\n\ +TOPK_U32(1 << 6, 6)\n\ +\n\ +#define TOPK_I32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + int local_id = get_local_id(0); \\\n\ + int work_group_size = get_local_size(0); \\\n\ + int offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local int local_data[128]; \\\n\ + __local int local_indices[128]; \\\n\ + \\\n\ + int left = read_imagei(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + int data = read_imagei(input, coord.zy).x; \\\n\ + int right = coord.z < width ? 
data : -2147483647; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (int stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + int signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + int postShift = (stage - passOfStage); \\\n\ + int pairDistance = 1 << postShift; \\\n\ + \\\n\ + int left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + int right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + int left_elem = local_data[left_id]; \\\n\ + int right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + int4 dst; \\\n\ + dst.x = local_data[local_id]; \\\n\ + dst.y = local_data[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(output, coord.xy, dst.xxxx); \\\n\ + write_imagei(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +TOPK_I32(1 << 0, 0)\n\ +TOPK_I32(1 << 1, 1)\n\ +TOPK_I32(1 << 2, 2)\n\ +TOPK_I32(1 << 3, 3)\n\ +TOPK_I32(1 << 4, 4)\n\ +TOPK_I32(1 << 5, 5)\n\ +TOPK_I32(1 << 6, 6)\n\ +"; /* end of topk_cl*/ + static const char upsample_cl[] = "\n\ #define UPSAMPLE_PROCESS(data_type, read_fun, write_fun) \\\n\ data_type src = 0; \\\n\ @@ -57701,6 +61190,9 @@ static const source_map_t evis_resource[] = {"clip_U8_vx", clip_U8_vx}, {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, {"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, + {"custom_softmax_vx", custom_softmax_vx}, + {"custom_warp_affine_vx", custom_warp_affine_vx}, + {"custom_warp_perspective_vx", custom_warp_perspective_vx}, {"depth2space_crd_vx", depth2space_crd_vx}, {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, {"depthwise_conv1d_src1_vx", depthwise_conv1d_src1_vx}, @@ -57714,7 +61206,9 @@ static const source_map_t evis_resource[] = {"floordiv_vx", floordiv_vx}, {"gather_vx", gather_vx}, {"gather_array_vx", gather_array_vx}, + {"gather_batch_vx", gather_batch_vx}, {"gather_mix_vx", gather_mix_vx}, + {"gather_mix_batch_vx", gather_mix_batch_vx}, {"gather_nd_vx", gather_nd_vx}, {"gather_nd_2d_vx", gather_nd_2d_vx}, {"gather_nd_2d_mix_vx", gather_nd_2d_mix_vx}, @@ -57785,6 +61279,7 @@ static const source_map_t evis_resource[] = {"lstmunit_activation_SP_U8_vx", lstmunit_activation_SP_U8_vx}, {"lstmunit_activation_S_F16_vx", lstmunit_activation_S_F16_vx}, {"lstmunit_activation_S_U8_vx", lstmunit_activation_S_U8_vx}, + {"matrixmul_bf16_vx", matrixmul_bf16_vx}, {"matrixmul_f16_vx", matrixmul_f16_vx}, {"matrixmul_f16f16_u8_vx", matrixmul_f16f16_u8_vx}, {"matrixmul_f16i16_i16_vx", matrixmul_f16i16_i16_vx}, @@ -57877,7 +61372,8 @@ static const source_map_t evis_resource[] = {"resize_bilinear_I16_vx", 
resize_bilinear_I16_vx}, {"resize_bilinear_I8_vx", resize_bilinear_I8_vx}, {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, - {"resize_bilinear_U8_half_pixel_centers_vx", resize_bilinear_U8_half_pixel_centers_vx}, + {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, + {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_nearest_vx", resize_nearest_vx}, @@ -57916,14 +61412,17 @@ static const source_map_t cl_resource[] = {"argmin_axis2_cl", argmin_axis2_cl}, {"batchnorm_single_cl", batchnorm_single_cl}, {"cast_cl", cast_cl}, + {"clip_BF16_cl", clip_BF16_cl}, {"clip_F32_cl", clip_F32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"depth2space_crd_cl", depth2space_crd_cl}, {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, {"eltwise_unary_cl", eltwise_unary_cl}, {"erf_cl", erf_cl}, {"floordiv_cl", floordiv_cl}, {"gather_cl", gather_cl}, + {"gather_batch_cl", gather_batch_cl}, {"gather_nd_cl", gather_nd_cl}, {"gather_nd_3d_cl", gather_nd_3d_cl}, {"group_normalization_f32_cl", group_normalization_f32_cl}, @@ -58015,6 +61514,7 @@ static const source_map_t cl_resource[] = {"space2depth_internal_cl", space2depth_internal_cl}, {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, + {"topk_cl", topk_cl}, {"upsample_cl", upsample_cl}, }; diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index cffc314..69f987a 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -49,7 +49,7 @@ uint8_t * vsi_nn_LoadBinarySource buf = NULL; - fp = fopen( (char *)file, "rb" ); + fp = vsi_nn_fopen( (char *)file, "rb" ); VSILOGI( "Loading program from binary file." 
); if( NULL == fp ) @@ -234,11 +234,13 @@ static vsi_status vsi_nn_RegisterVXKernel if(evis == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); } else { - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); } status = vxBuildProgram(program, cmd); @@ -319,14 +321,16 @@ static vsi_status vsi_nn_RegisterBinKernel if(evis == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); } else { - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); } #else - sprintf(cmd, "-cl-viv-vx-extension"); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension"); #endif status = vxBuildProgram(program, cmd); @@ -530,7 +534,7 @@ void vsi_nn_VxResourceSetPath char* path ) { - strncpy(s_vx_resource_path, path, VSI_NN_MAX_PATH - 1); + vsi_nn_strncpy(s_vx_resource_path, path, VSI_NN_MAX_PATH - 1); } /* vsi_nn_VxResourceSetPath() */ const uint8_t * vsi_nn_VxBinResourceGetResource diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c index 1ce386a..f1141ba 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -51,6 +51,7 @@ static vsi_status op_compute { status = VSI_FAILURE; } + self->n = (vx_node)n; return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 70ff65e..06d439b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -131,7 +131,8 @@ static vsi_status _static_batchnorm ) { vsi_status status; - vx_tensor vx_input,vx_output; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; status = VSI_FAILURE; status = _try_set_high_presision_tensor(inputs); @@ -142,29 +143,35 @@ static vsi_status _static_batchnorm } if(_is_3d_batchnorm(self, inputs)) { - vx_input = self->nn_param.batch_norm.local->reshaped_input->t; - vx_output = self->nn_param.batch_norm.local->reshaped_output->t; + reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input; + reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output; } else { - vx_input = inputs[0]->t; - vx_output = outputs[0]->t; + reshape_tensors[0] = inputs[0]; + reshape_tensors[5] = outputs[0]; } - self->n = vxBatchNormalizationLayer( - self->graph->g, - self->nn_param.batch_norm.eps, - inputs[1]->t, - inputs[2]->t, - inputs[3]->t, - inputs[4]->t, - vx_input, - vx_output - ); - if( NULL == self->n ) + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = inputs[2]; + reshape_tensors[3] = inputs[3]; + reshape_tensors[4] = inputs[4]; + + param = vsi_nn_kernel_param_create(); + 
vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "batch_norm", + reshape_tensors, 5, + &reshape_tensors[5], 1, param ); + + if( self->n ) { - status = VSI_FAILURE; + status = VSI_SUCCESS; } + + vsi_nn_kernel_param_release( ¶m ); + return status; } @@ -439,7 +446,6 @@ static vsi_bool op_check } } /* op_check() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -492,7 +498,6 @@ static vsi_status op_deinit return VSI_SUCCESS; } - #ifdef __cplusplus extern "C" { #endif @@ -512,4 +517,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 4399d22..87aa2ba 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -379,7 +379,7 @@ static vsi_status op_deinit ) { vsi_status status = VSI_SUCCESS; - + vsi_nn_internal_deinit_node_wksp( self ); return status; } /* op_deinit() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index fec61bb..be82720 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -219,7 +219,7 @@ static vsi_bool op_check IO_TYPE(D_I16, D_U8) IO_TYPE(D_I8|Q_DFP, D_F32) IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U32) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) @@ -247,7 +247,7 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U32) IO_TYPE(D_U8|Q_ASYM, D_F32) IO_TYPE(D_U8, D_U8|Q_ASYM) @@ -286,6 +286,8 @@ static vsi_bool op_check IO_TYPE(D_U32, D_U16) IO_TYPE(D_U32, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U8) + IO_TYPE(D_BF16, D_I32) + IO_TYPE(D_I32, D_BF16) /* HW 9.0.1 */ IO_TYPE(D_I8|Q_DFP, D_BF16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c new file mode 100644 index 0000000..7048f51 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c @@ -0,0 +1,302 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _deconv3d_local_data_t { + int32_t placeholder; +} deconv3d_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +#define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ + (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + // Create kernel param + vsi_nn_kernel_param_t * param; + //vsi_nn_kernel_node_t n; + param = vsi_nn_kernel_param_create(); + + // Add params +#define MAP_PARAM(type_name, value) {\ + vsi_nn_kernel_param_add_int32( param, type_name, value); \ + } + + MAP_PARAM("stride_w",self->nn_param.deconv3d.stride[0]); + MAP_PARAM("stride_h",self->nn_param.deconv3d.stride[1]); + MAP_PARAM("stride_d",self->nn_param.deconv3d.stride[2]); + + MAP_PARAM("outpadding_w",self->nn_param.deconv3d.output_padding[0]); + MAP_PARAM("outpadding_h",self->nn_param.deconv3d.output_padding[1]); + MAP_PARAM("outpadding_d",self->nn_param.deconv3d.output_padding[2]); + + MAP_PARAM("pad_left",self->nn_param.deconv3d.pad[0]); + MAP_PARAM("pad_right",self->nn_param.deconv3d.pad[1]); + MAP_PARAM("pad_top",self->nn_param.deconv3d.pad[2]); + MAP_PARAM("pad_bottom",self->nn_param.deconv3d.pad[3]); + MAP_PARAM("pad_front",self->nn_param.deconv3d.pad[4]); + MAP_PARAM("pad_end",self->nn_param.deconv3d.pad[5]); + + MAP_PARAM("weights",self->nn_param.deconv3d.weights); + MAP_PARAM("group",self->nn_param.deconv3d.group); + + MAP_PARAM("overflow_policy",self->vx_param.overflow_policy); + MAP_PARAM("rounding_policy",self->vx_param.rounding_policy); + MAP_PARAM("down_scale_size_rounding",self->vx_param.down_scale_size_rounding); + +#undef MAP_PARAM + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "deconv3d", + inputs, 3, outputs, 1, param ); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_DECONVOLUTION, self, inputs, outputs); + + return ret; +} /* op_check() */ + +void _rotate_weight_data( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * weights) +{ + vsi_ssize_t oc = 0, ic = 0; + uint8_t* weight_data = NULL; + uint8_t* buffer = NULL; + vsi_ssize_t kernel_size_w = weights->attr.size[0]; + vsi_ssize_t kernel_size_h = weights->attr.size[1]; + vsi_ssize_t kernel_size_d = weights->attr.size[2]; + vsi_ssize_t weight_ic = weights->attr.size[3]; + vsi_ssize_t weight_oc = weights->attr.size[4]; + vsi_ssize_t slice_size = kernel_size_w * kernel_size_h; + vsi_ssize_t depth_size = slice_size * 
kernel_size_d; + int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type); + + weight_data = vsi_nn_ConvertTensorToData(graph, weights); + buffer = (uint8_t*)malloc(item_size * depth_size * weight_ic * weight_oc); + memset(buffer, 0x00, item_size * depth_size * weight_ic * weight_oc); + //memcpy(buffer, weight_data, item_size * slice_size * weight_ic * weight_oc); + for(oc = 0; oc < weight_oc; oc++) + { + for(ic = 0; ic < weight_ic; ic++) + { + vsi_ssize_t d, h, w; + vsi_ssize_t offset = item_size * depth_size * (oc * weight_ic + ic); + for(d = 0; d < kernel_size_d; d++) + { + uint8_t *src_depth = weight_data + offset + (kernel_size_d - d - 1) * item_size * slice_size; + uint8_t *dst_depth = buffer + offset + d * item_size * slice_size; + for(h = 0; h < kernel_size_h; h ++) + { + uint8_t *dst_height = dst_depth + h * kernel_size_w * item_size; + uint8_t *src_height = src_depth + (kernel_size_h - 1 - h) * kernel_size_w * item_size; + for(w = 0; w < kernel_size_w; w++) + { + memcpy(dst_height + w * item_size, + src_height + (kernel_size_w - 1 - w) * item_size, + item_size); + } + } + } + } + } + + vsi_nn_CopyDataToTensor( graph, weights, buffer ); + vsi_nn_Free( buffer ); + vsi_nn_safe_free( weight_data ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_deconv3d_param *nn_param; + + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } + + /* Rotate 180 degrees for weights data */ + if (TRUE == inputs[1]->attr.is_const) + { + _rotate_weight_data(self->graph, inputs[1]); + } + else + { + VSILOGE("deconv3d: do not support dynamic weight"); + } + + nn_param = &self->nn_param.deconv3d; + + nn_param->group = ( 0 == nn_param->group ) ? 
1 : nn_param->group;
+    nn_param->ksize[0] = (uint32_t)inputs[1]->attr.size[0];
+    nn_param->ksize[1] = (uint32_t)inputs[1]->attr.size[1];
+    nn_param->ksize[2] = (uint32_t)inputs[1]->attr.size[2];
+
+    if(nn_param->group != 1)
+    {
+        VSILOGE("deconv3d: only support group == 1, but group is %d", nn_param->group);
+        return FALSE;
+    }
+
+    if(nn_param->ksize[2] < nn_param->stride[2])
+    {
+        VSILOGE("deconv3d: only support kernel_depth >= stride_depth, but \
+            kernel_depth = %d, stride_depth = %d", nn_param->ksize[2], nn_param->stride[2]);
+        return FALSE;
+    }
+
+    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.size[0] = COMPUTE_DECONV_SZ(
+            inputs[0]->attr.size[0],
+            nn_param->ksize[0],
+            nn_param->pad[0],
+            nn_param->pad[1],
+            nn_param->stride[0],
+            nn_param->output_padding[0]
+            );
+
+        outputs[0]->attr.size[1] = COMPUTE_DECONV_SZ(
+            inputs[0]->attr.size[1],
+            nn_param->ksize[1],
+            nn_param->pad[2],
+            nn_param->pad[3],
+            nn_param->stride[1],
+            nn_param->output_padding[1]
+            );
+        outputs[0]->attr.size[2] = COMPUTE_DECONV_SZ(
+            inputs[0]->attr.size[2],
+            nn_param->ksize[2],
+            nn_param->pad[4],
+            nn_param->pad[5],
+            nn_param->stride[2],
+            nn_param->output_padding[2]
+            );
+        if(self->nn_param.deconv3d.weights > 0)
+        {
+            outputs[0]->attr.size[3] = self->nn_param.deconv3d.weights;
+        }
+        else
+        {
+            outputs[0]->attr.size[3] = inputs[1]->attr.size[3];
+        }
+        outputs[0]->attr.size[4] = inputs[0]->attr.size[4];
+        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+    }
+
+    return TRUE;
+} /* op_setup() */
+
+static vsi_status op_init
+    (
+    vsi_nn_node_t* self
+    )
+{
+    /* TODO
+    //self->nn_param.deconv3d.local = \
+    //    (deconv3d_local_data_t*)malloc(sizeof(deconv3d_local_data_t));
+    */
+
+    return VSI_SUCCESS;
+} /* op_init() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t* self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    status = vsi_nn_op_common_deinit(self);
+
+    /* TODO
+    //vsi_nn_safe_free(self->nn_param.deconv3d.local);
+    */
+
+    return status;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name */ DECONV3D,
+    /* init */ op_init,
+    /* compute */ op_compute,
+    /* deinit */ op_deinit,
+    /* check */ op_check,
+    /* setup */ op_setup,
+    /* optimize */ NULL,
+    /* input_num */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+
+__END_DECLS
\ No newline at end of file
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
index d9de8b9..1f39eb7 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
@@ -114,6 +114,8 @@ static vsi_bool op_check
         IO_TYPE(D_I8|Q_DFP, D_F16)
         IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_F16)
+        IO_TYPE(D_BF16, D_BF16)
+        IO_TYPE(D_F32, D_F32)
     END_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL)
     if(!VALIDATE_OP_IO_TYPES(DEPTH2SPACE_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) {
         char* desc = generate_op_io_types_desc(inputs,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
index 2373688..19a5303 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
@@ -198,32 +198,30 @@ static vsi_bool op_check_minimum
 {
     /* check inputs outputs data type */
     BEGIN_IO_TYPE_DECL(MINIMUM, 2, 1)
-        IO_TYPE(D_F16, D_F16, D_F16)
-        IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
-        IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
-        IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
-
-
IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(MINIMUM) if(!VALIDATE_OP_IO_TYPES(MINIMUM, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -245,32 +243,30 @@ static vsi_bool op_check_maximum { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(MAXIMUM, 2, 1) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + 
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(MAXIMUM) if(!VALIDATE_OP_IO_TYPES(MAXIMUM, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index d8ae9d9..a3a054e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -104,11 +104,11 @@ static vsi_bool op_setup out_rank = inputs[0]->attr.dim_num; - for(i = 0; i < out_rank; i++) + for (i = 0; i < out_rank; i++) { shape[i] = inputs[0]->attr.size[i]; } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); @@ -120,7 +120,7 @@ static vsi_bool op_setup total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); - if( total_size_expected != total_size_got ) + if ( total_size_expected != total_size_got ) { VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); @@ -225,6 +225,7 @@ DEF_OP_REG(name, op_init_##kernel_name, op_compute_##kernel_name, \ vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) DEF_ELEMENT_WISE_UNARY_OP( SIN, sin ); +DEF_ELEMENT_WISE_UNARY_OP( COS, cos ); DEF_ELEMENT_WISE_UNARY_OP( EXP, exp ); DEF_ELEMENT_WISE_UNARY_OP( LOG, log ); DEF_ELEMENT_WISE_UNARY_OP( ELU, elu ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index 325e9c1..0c57380 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -117,6 +117,12 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) END_IO_TYPE_DECL(FLOORDIV) if (!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index cf19eeb..6c1bdc2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -165,6 +165,20 @@ static vsi_bool op_check /* NN Support - F32 */ IO_TYPE(D_F32, D_BF16, D_F32, D_F32) IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + 
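Editor's note on the eltwise-unary change above: adding COS needs only one new DEF_ELEMENT_WISE_UNARY_OP line because the macro stamps out the per-operator init/compute glue with token pasting before handing it to DEF_OP_REG. A minimal sketch of that macro pattern, with illustrative names that are not the ovxlib macro:

    #include <stdio.h>
    #include <math.h>

    /* Each expansion generates a distinct op_<func> symbol via ## pasting. */
    #define DEF_UNARY_OP(NAME, FUNC) \
        static double op_##FUNC(double x) { return FUNC(x); }

    DEF_UNARY_OP(SIN, sin)
    DEF_UNARY_OP(COS, cos)

    int main(void)
    {
        printf("%f %f\n", op_sin(0.0), op_cos(0.0));  /* 0.000000 1.000000 */
        return 0;
    }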
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + END_IO_TYPE_DECL(FCL_RELU) ret = VALIDATE_OP_IO_TYPES(FCL_RELU, self, inputs, self->input.num, outputs, self->output.num); @@ -347,4 +361,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 8120757..34bcd78 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -51,6 +51,7 @@ static vsi_status op_compute uint32_t i = 0; vsi_size_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1; int32_t axis = self->nn_param.gather.axis; + int32_t batch_dims = self->nn_param.gather.batch_dims; vsi_size_t *input_size = inputs[0]->attr.size; uint32_t dims_num = inputs[0]->attr.dim_num; @@ -62,11 +63,11 @@ static vsi_status op_compute } axis_num = input_size[axis]; - for(i = axis + 1; i < dims_num; ++i) + for(i = axis + 1; i < dims_num - batch_dims; ++i) { block_num *= input_size[i]; } - for(i = 0; i < (uint32_t)inputs[1]->attr.dim_num; ++i) + for(i = 0; i < (uint32_t)inputs[1]->attr.dim_num - batch_dims; ++i) { indices_num *= inputs[1]->attr.size[i]; } @@ -76,6 +77,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "axis_num", (int32_t)axis_num ); vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num ); + vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); if( n != NULL ) { @@ -125,6 +127,18 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.gather.batch_dims = 0; + + return status; +} /* op_init() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -186,7 +200,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ GATHER, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 28a490c..6890763 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -196,6 +196,12 @@ static vsi_status op_compute VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); return VSI_FAILURE; } + else + { + // no need to maintain self->n + vxReleaseNode( &self->n ); + self->n = NULL; + } } return VSI_SUCCESS; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index ed652c3..cdead0c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -37,6 +37,7 @@ #include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -72,18 +73,46 @@ static vsi_status _try_set_high_presision_tensor return status; } -static vsi_bool _is_3d_instance_norm +static void 
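Editor's note on the gather batch_dims change above: the trailing batch_dims entries of both the data shape and the indices shape are treated as shared batch axes (ovxlib sizes are stored innermost-first, so batches sit at the high end of the size array; that layout reading is an assumption), and they are excluded when block_num and indices_num are derived for the kernel. A small sketch of that bookkeeping with a hypothetical gather_counts helper:

    #include <stddef.h>
    #include <stdint.h>

    /* Element-count bookkeeping for a batched gather, mirroring the hunk above. */
    static void gather_counts(const size_t *in_size, uint32_t in_rank,
                              const size_t *idx_size, uint32_t idx_rank,
                              uint32_t axis, uint32_t batch_dims,
                              size_t *block_size, size_t *block_num,
                              size_t *indices_num)
    {
        uint32_t i;
        *block_size = 1;   /* elements below the gather axis            */
        *block_num = 1;    /* slices above the axis, batches excluded   */
        *indices_num = 1;  /* index entries per batch                   */
        for (i = 0; i < axis; ++i)                        *block_size *= in_size[i];
        for (i = axis + 1; i < in_rank - batch_dims; ++i) *block_num *= in_size[i];
        for (i = 0; i < idx_rank - batch_dims; ++i)       *indices_num *= idx_size[i];
    }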
vsi_nn_optimize_instance_norm_shape ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x ) { - if( 3 == inputs[0]->attr.dim_num ) + vsi_size_t rank = rank_x; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { {0} }; + + if (rank_x > 4) { - return TRUE; + memcpy(shape[0], shape_x, (rank_x - 2) * sizeof(vsi_size_t)); + + vsi_nn_kernel_optimize_element_shape(shape[0], rank_x - 2, shape[1], &rank); } - return FALSE; -} /* _is_3d_instance_norm() */ + + if (rank_x == 3) + { + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = 1; + out_shape_x[2] = shape_x[1]; + out_shape_x[3] = shape_x[2]; + + *out_rank_x = 4; + } + /****reshape [n, c, d0, d1, ..., dn] to [n, c, h, w]***/ + else if (rank_x > 4 && rank == 2) + { + memcpy(out_shape_x, shape[1], 2 * sizeof(vsi_size_t)); + memcpy(&out_shape_x[2], &shape_x[rank_x - 2], 2 * sizeof(vsi_size_t)); + + *out_rank_x = 4; + } + else + { + memcpy(out_shape_x, shape_x, rank_x * sizeof(vsi_size_t)); + + *out_rank_x = rank_x; + } +} static vsi_status op_compute ( @@ -96,112 +125,48 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.instancenorm.eps; - vsi_size_t *input_size = inputs[0]->attr.size; - vsi_size_t dims_num = inputs[0]->attr.dim_num; - int32_t rs_flg = 0; - vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; - vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; - vsi_nn_instancenorm_lcl_data2 *local = self->nn_param.instancenorm.lcl2_data; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; + vsi_nn_tensor_t * tmp_tensors[4] = {NULL}; - status = _try_set_high_presision_tensor(inputs); + vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); + + tmp_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shape, new_rank ); + tmp_tensors[1] = inputs[1]; + tmp_tensors[2] = inputs[2]; + tmp_tensors[3] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shape, new_rank ); + + status = _try_set_high_presision_tensor(tmp_tensors); if(status != VSI_SUCCESS) { VSILOGE("Set tensor attr of high presision fail"); return status; } - if(_is_3d_instance_norm(self, inputs)) - { - tmp_inputs[0] = local->reshaped_input; - tmp_outputs[0] = local->reshaped_output; - tmp_inputs[1] = inputs[1]; - tmp_inputs[2] = inputs[2]; - } - else - { - tmp_inputs[0] = inputs[0]; - tmp_outputs[0] = outputs[0]; - tmp_inputs[1] = inputs[1]; - tmp_inputs[2] = inputs[2]; - if((input_size[1] * input_size[2] < 65536) - && dims_num > 2) - { - rs_flg = 1; - } - } - param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); - vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); + n = vsi_nn_kernel_selector( self->graph, "instance_norm", - tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param ); + tmp_tensors, _INPUT_NUM, &tmp_tensors[3], _OUTPUT_NUM, param ); if( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } + vsi_safe_release_tensor(tmp_tensors[0]); + vsi_safe_release_tensor(tmp_tensors[3]); + return status; } /* op_compute() */ -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - uint32_t dim = 0; - vsi_nn_instancenorm_lcl_data2 *local = NULL; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; - char tensor_name[128]; - - 
dim = inputs[0]->attr.dim_num; - if(_is_3d_instance_norm(self, inputs) == FALSE) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - /* - insert a reshape node before and after 3D instance_norm - */ - shape[0] = inputs[0]->attr.size[0]; - shape[1] = 1; - shape[2] = inputs[0]->attr.size[1]; - shape[3] = inputs[0]->attr.size[2]; - dim = 4; - local = self->nn_param.instancenorm.lcl2_data; - if (VSI_NN_OPTIMIZE_FORWARD == direction) - { - /* reshape 3d input (xcn) --> 4d input (whcn) */ - local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); - } - else - { - /* reshape 3d output(xcn) --> 4d output(whcn) */ - local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); - if(local->reshaped_output && local->reshaped_output->t) - { - memset(tensor_name, 0, sizeof(tensor_name)); - snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); - if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) - { - VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid); - return VSI_FAILURE; - } - } - } - - return VSI_SUCCESS; -} /* op_optimize() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -241,66 +206,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ -static vsi_status op_init - ( - vsi_nn_node_t * self - ) -{ - vsi_status status = VSI_SUCCESS; - - self->nn_param.instancenorm.lcl2_data = - (vsi_nn_instancenorm_lcl_data2 *)malloc(sizeof(vsi_nn_instancenorm_lcl_data2)); - if (NULL == self->nn_param.instancenorm.lcl2_data) - { - return VX_ERROR_NO_MEMORY; - } - - memset( self->nn_param.instancenorm.lcl2_data, 0, sizeof(vsi_nn_instancenorm_lcl_data2) ); - - self->nn_param.instancenorm.lcl2_data->reshapeFlg = 0; - self->nn_param.instancenorm.lcl2_data->execute_on_sw = 0; - self->nn_param.instancenorm.lcl2_data->hash_idx = 0; - self->nn_param.instancenorm.lcl2_data->reshaped_input = NULL; - self->nn_param.instancenorm.lcl2_data->reshaped_output = NULL; - - return status; -} /* op_init() */ - -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - uint32_t i; - vsi_nn_instancenormalize_param *p = &(self->nn_param.instancenorm); - for (i = 0; i < _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.instancenorm.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.instancenorm.local.local_tensor[i])); - self->nn_param.instancenorm.local.local_tensor[i] = NULL; - } - } - if(p->lcl2_data->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_input)); - p->lcl2_data->reshaped_input = NULL; - } - if(p->lcl2_data->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_output)); - p->lcl2_data->reshaped_output = NULL; - } - if(self->nn_param.instancenorm.lcl2_data) - { - free(self->nn_param.instancenorm.lcl2_data); - self->nn_param.instancenorm.lcl2_data = NULL; - } - vsi_nn_op_common_deinit(self); - - return VSI_SUCCESS; -} /* op_deinit() */ - #ifdef __cplusplus extern "C" { #endif @@ -308,12 +213,12 @@ extern "C" { DEF_OP_REG ( /* op_name */ INSTANCE_NORM, - /* init */ op_init, + /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ vsi_nn_op_common_setup, - /* optimize */ op_optimize, + /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c 
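Editor's note on the instance-norm rework above: vsi_nn_optimize_instance_norm_shape folds every input into the 4-D layout the kernels expect, so the separate 3-D optimize pass and its reshape bookkeeping could be deleted. For the common 3-D case a [w, c, n] tensor becomes [w, 1, c, n], which is the same view the removed op_optimize used to build with explicit reshape nodes. A small standalone illustration of that mapping (the rank > 4 flattening branch is omitted here):

    #include <stddef.h>

    /* Fold a 3-D instance-norm input [w, c, n] into 4-D [w, 1, c, n];
     * shapes of rank <= 4 are passed through unchanged. */
    static void fold_to_4d(const size_t *in, size_t rank,
                           size_t *out, size_t *out_rank)
    {
        if (rank == 3)
        {
            out[0] = in[0]; out[1] = 1; out[2] = in[1]; out[3] = in[2];
            *out_rank = 4;
        }
        else
        {
            size_t i;
            for (i = 0; i < rank; ++i) out[i] = in[i];
            *out_rank = rank;
        }
    }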
b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 8c298ae..08955f3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -45,7 +45,6 @@ #define VSI_NN_L2NORMALIZESCALE_DEFAULT_AXIS 2 - static vsi_nn_tensor_t* _expand_scale_tensor ( vsi_nn_graph_t *graph, @@ -84,7 +83,7 @@ static vsi_nn_tensor_t* _expand_scale_tensor attr.size[0] = scale_size_out; attr.size[1] = 1; attr.dim_num = 2; - out_dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; attr.vtl = FALSE; scale_tensor = vsi_nn_CreateTensor(graph, &attr); @@ -115,7 +114,6 @@ final: return scale_tensor; } - static vsi_bool _check_value_is_equal_to_one ( vsi_nn_graph_t* graph, @@ -429,4 +427,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c index 69e27a1..3e79acc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c @@ -55,11 +55,13 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "b_v", self->nn_param.linear.b ); n = vsi_nn_kernel_selector( self->graph, "linear", inputs, 1, outputs, 1, param ); - if( n == NULL ) + if ( n == NULL ) { status = VSI_FAILURE; } + self->n = (vx_node)n; + vsi_nn_kernel_param_release( ¶m ); return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c index bff1972..7b2d441 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c @@ -113,6 +113,7 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(LOGICAL_OPS, 2, 1) IO_TYPE(D_I8, D_I8, D_I8) IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) + IO_TYPE(D_BF16, D_BF16, D_BOOL8) END_IO_TYPE_DECL(LOGICAL_OPS) if(!VALIDATE_OP_IO_TYPES(LOGICAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index 5433281..d792d34 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -634,7 +634,7 @@ static vsi_bool op_setup { if ( p->local->use_hybrid && p->local->use_projection_bias ) { - vsi_bool use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; + use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); zero_bias_tensor = input_tensor->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index fcf29f4..5da258f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -174,6 +174,7 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) END_IO_TYPE_DECL(MATRIXMUL) if (!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c index bd24f7d..881767e 100644 
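Editor's note on the lstmunit_ovxlib change above: the old line declared a fresh use_virtual_tensor inside the inner block, shadowing the function-scope variable the rest of op_setup reads, so the value assigned there was silently discarded at the closing brace. The fix turns it into a plain assignment. A minimal reproduction of the shadowing pattern:

    #include <stdio.h>

    int main(void)
    {
        int use_virtual = 0;
        {
            int use_virtual = 1;   /* bug: new variable shadows the outer one */
            (void)use_virtual;
        }
        printf("%d\n", use_virtual);   /* prints 0 */

        {
            use_virtual = 1;           /* fix: assignment updates the outer variable */
        }
        printf("%d\n", use_virtual);   /* prints 1 */
        return 0;
    }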
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -121,6 +121,8 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F32, D_F32) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32, D_F32) END_IO_TYPE_DECL(MOMENTS) if (!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c index bddcc12..99dbd5d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c @@ -107,6 +107,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(ONE_HOT) if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c new file mode 100644 index 0000000..d0b89aa --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c @@ -0,0 +1,198 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _pad2_local_data_t { + int32_t placeholder; +} pad2_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static int32_t _get_vx_pad_mode(vx_enum mode) +{ + int32_t pad_mode = 0; + switch (mode) + { + case VSI_NN_PAD_MODE_CONSTANT: + pad_mode = VX_PAD_CONSTANT; + break; + case VSI_NN_PAD_MODE_REPLICATE: + pad_mode = VX_PAD_REPLICATE; + break; + case VSI_NN_PAD_MODE_SYMMETRIC: + pad_mode = VX_PAD_MIRROR_SYMMETRIC; + break; + case VSI_NN_PAD_MODE_REFLECT: + pad_mode = VX_PAD_MIRROR_REFLECT; + break; + default: + VSILOGE("Wrong pad_mode value"); + break; + } + + return pad_mode; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_pad2_param *p = &self->nn_param.pad2; + vsi_nn_kernel_param_t * param; + int32_t pad_mode = _get_vx_pad_mode(p->mode); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_buffer( param, "front_size", (void *)p->front_size, p->dim_num ); + vsi_nn_kernel_param_add_buffer( param, "back_size", (void *)p->back_size, p->dim_num ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", pad_mode ); + vsi_nn_kernel_param_add_float32( param, "const_val", p->const_val ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "pad2", + inputs, 1, outputs, 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PAD2, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(PAD2) + if (!VALIDATE_OP_IO_TYPES(PAD2, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + if (self->nn_param.pad2.dim_num != inputs[0]->attr.dim_num + && self->nn_param.pad2.dim_num != 0 ) + { + VSILOGE("Error:input tensor dim should be equal with pad's."); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + if (self->nn_param.pad2.dim_num == 0) + { + self->nn_param.pad2.dim_num = (uint8_t)inputs[0]->attr.dim_num; + } + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + for (i = 0; i < self->nn_param.pad2.dim_num; i ++) + { + uint32_t front = self->nn_param.pad2.front_size[i]; + uint32_t back = self->nn_param.pad2.back_size[i]; + outputs[0]->attr.size[i] = inputs[0]->attr.size[i] + front + back; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + else + { + for (i = 0; i < self->nn_param.pad2.dim_num; i ++) + { + uint32_t front = self->nn_param.pad2.front_size[i]; + uint32_t back = self->nn_param.pad2.back_size[i]; + + if (front + back + inputs[0]->attr.size[i] != outputs[0]->attr.size[i]) + { + VSILOGE("Error:output shape[%u] not equal front padding[%u] + input shape[%u] + back padding[%u]", + outputs[0]->attr.size[i], front, back); + return FALSE; + } + } + } + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PAD2, + /* init */ NULL, + /* compute */ op_compute, + /* 
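Editor's note on the PAD2 op_setup above: the output extent along every axis is simply the input extent plus the front and back pad counts, and the same identity is re-checked when the caller supplied an explicit output shape. The arithmetic in isolation (padded_extent is an illustrative helper, not part of the patch):

    #include <stdint.h>

    /* Padded extent along one axis: input + pad_front + pad_back.
     * e.g. a width of 5 with front = 1 and back = 2 becomes 8. */
    static uint32_t padded_extent(uint32_t in, uint32_t front, uint32_t back)
    {
        return in + front + back;
    }

    /* Validation direction used when the output shape is pre-set. */
    static int padded_extent_matches(uint32_t in, uint32_t front,
                                     uint32_t back, uint32_t out)
    {
        return out == in + front + back;
    }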
deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index e61d9f2..ecbf5fa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -53,6 +53,7 @@ typedef struct _vsi_nn_reduce_lcl2_data_t vsi_nn_tensor_t *axis_tensor2; int32_t axes[VSI_NN_MAX_DIM_NUM]; int32_t axes_num; + vsi_bool use_internal_node; } vsi_nn_reduce_lcl2_data_t; static vsi_status op_comput_reduce_mean(vsi_nn_node_t * self, @@ -148,7 +149,54 @@ static vsi_bool caculate_reshape_size(uint32_t* dim_value, return enable_reshape; } +static vsi_bool _check_is_sp_supported_type + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_enum type + ) +{ + int32_t * axes = self->nn_param.reduce.local2->axes; + int32_t axes_num = self->nn_param.reduce.local2->axes_num; + vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t i = 0; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + vsi_bool ret = FALSE; + if ( !self->graph->ctx->config.support_stream_processor || + (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN) ) + { + return FALSE; + } + + if ( (VSI_NN_TYPE_FLOAT64 == input->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT32 == input->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT64 == input->attr.dtype.vx_type) + ) + { + return FALSE; + } + + for (i = 0; i < axes_num; i++) + { + shapes[0][i] = input->attr.size[axes[i]]; + shapes[1][i] = 1; + axis_in[i] = i; + } + + ret = vsi_nn_kernel_optimize_reduce_shape( + shapes[0], axes_num, + axis_in, axes_num, + shapes[1], axes_num, + shapes[2], &rank_in, shapes[3], &rank_out, + new_axis, &axis_size); + + return ret && axis_size < 3; +} static vsi_status op_compute ( vsi_nn_node_t * self, @@ -158,7 +206,11 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; - if (self->nn_param.reduce.type == VSI_NN_REDUCE_MEAN) + if ( self->nn_param.reduce.local2->use_internal_node ) + { + status = vsi_nn_internal_compute_node( self ); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MEAN) { vx_tensor input_t, output_t; vsi_nn_tensor_t *axis_tensor = NULL; @@ -440,16 +492,6 @@ static vsi_status op_compute input_t, output_t); } - - } - else if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || - self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || - self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || - self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || - self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || - self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) - { - status = vsi_nn_internal_compute_node( self ); } return status; @@ -463,12 +505,7 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || - self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || - self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || - self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || - self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || - self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + if ( self->nn_param.reduce.local2->use_internal_node ) { return vsi_nn_internal_optimize_node(self, direction ); } @@ -726,7 +763,6 @@ static vsi_bool op_set_reduce_axis( (vsi_size_t*)resolved_dim2, &resolved_dim_count2 ); } - for (i = 0; i < 
(uint32_t)resolved_dim_count2; i++) { self->nn_param.reduce.local2->axes[i] = (int32_t)resolved_dim2[i]; @@ -736,6 +772,92 @@ static vsi_bool op_set_reduce_axis( return TRUE; } +static vsi_bool op_set_sp_reduce_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_enum type_name + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_tensor_t* new_output = NULL; + uint32_t* permute_in_perm = NULL; + int32_t * new_axis = NULL; + vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1}; + int32_t use_virtual_tensor = TRUE; + vsi_nn_internal_node_t* tmp_inode = NULL; + int32_t * axes = self->nn_param.reduce.local2->axes; + int32_t axes_num = self->nn_param.reduce.local2->axes_num; + int32_t i = 0, j = 0, index = 0; + vsi_size_t reduce_size = 1; + + vsi_nn_internal_init_node_wksp( self ); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); + tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, + inputs[0]->attr.dim_num * sizeof(uint32_t)); + + for ( i = 0; i < axes_num; i++) + { + shapes[index] = outputs[0]->attr.size[axes[i]]; + permute_in_perm[index ++] = axes[i]; + reduce_size *= inputs[0]->attr.size[axes[i]]; + } + + for ( j = 0; j < (int32_t)inputs[0]->attr.dim_num; j++) + { + for (i = 0; i < axes_num; i++) + { + if (j == axes[i]) + { + break; + } + } + if (i == axes_num) + { + shapes[index] = outputs[0]->attr.size[j]; + permute_in_perm[index ++] = j; + } + } + tmp_inode->node->nn_param.permute.perm = permute_in_perm; + tmp_inode->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num; + tmp_inode->inputs[0] = inputs[0]; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); + new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode, + axes_num * sizeof(int32_t)); + for (i = 0; i < axes_num; i++) + { + new_axis[i] = i; + } + tmp_inode->inputs[0] = tensor1->t; + tmp_inode->outputs[0] = new_output; + tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis; + tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num; + if (type_name == VSI_NN_REDUCE_SUM) + { + tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f; + } + else + { + tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f / (float)reduce_size; + } + vsi_nn_internal_setup_node(self, tmp_inode); + + self->nn_param.reduce.local2->reshaped_output = new_output; + + return TRUE; +} static vsi_bool op_set_reduce_internal ( @@ -920,7 +1042,6 @@ static vsi_bool op_set_reduce_internal curr->outputs[0] = tmp_output_tensor[1]->t; vsi_nn_internal_setup_node( self, curr ); - if (3 == axes[resolved_dim_count - 1]) { vsi_bool enable_reshape = TRUE; @@ -968,7 +1089,6 @@ static vsi_bool op_set_reduce_internal return TRUE; } - static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -1063,32 +1183,43 @@ static vsi_bool op_setup reshape_out_t[0] = vsi_nn_reshape_tensor( self->graph, outputs[0], shape, new_rank ); self->nn_param.reduce.local2->reshaped_output1 = reshape_out_t[0]; - if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM) + + if (_check_is_sp_supported_type(self, reshape_in_t[0], 
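Editor's note on op_set_sp_reduce_internal above: the stream-processor path routes both REDUCE_SUM and REDUCE_MEAN onto the single REDUCE_MEAN_INTERNAL node by treating the result as a scaled sum, with scale = 1 reproducing the sum and scale = 1/N the mean over the N reduced elements. Assuming that reading of the internal kernel, a plain reference version is:

    #include <stddef.h>

    /* Scaled reduction over n elements: sum(x) * scale.
     *   scale == 1.0f       -> behaves like REDUCE_SUM
     *   scale == 1.0f / n   -> behaves like REDUCE_MEAN */
    static float scaled_reduce(const float *x, size_t n, float scale)
    {
        float acc = 0.0f;
        size_t i;
        for (i = 0; i < n; ++i)
        {
            acc += x[i];
        }
        return acc * scale;
    }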
self->nn_param.reduce.type)) { + self->nn_param.reduce.local2->use_internal_node = TRUE; + ret = op_set_sp_reduce_internal(self, reshape_in_t, reshape_out_t, self->nn_param.reduce.type); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM) + { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCESUM_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MAX) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEMAX_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MIN) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEMIN_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEPROD_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_ALL) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEALL_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_ANY) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEANY_INTERNAL); } - return ret; } /* op_setup() */ @@ -1097,6 +1228,8 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_bool use_interanl_node = self->nn_param.reduce.local2->use_internal_node; + if (self->nn_param.reduce.local.axis_tensor != NULL) { vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local.axis_tensor)); @@ -1132,12 +1265,7 @@ static vsi_status op_deinit self->nn_param.reduce.local2 = NULL; } - if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || - self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || - self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || - self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || - self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || - self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + if ( use_interanl_node ) { vsi_nn_internal_deinit_node_wksp(self); } @@ -1184,4 +1312,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c new file mode 100644 index 0000000..ced3cd7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c @@ -0,0 +1,163 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +typedef struct _reduce_mean_internal_local_data_t { + int32_t placeholder; +} reduce_mean_internal_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t * axis = self->nn_param.reduce_mean_internal.axis; + int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num; + float scale = self->nn_param.reduce_mean_internal.scale; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + vsi_bool ret = FALSE; + vsi_nn_kernel_param_t * param = NULL; + + ret = vsi_nn_kernel_optimize_reduce_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[1], &rank_out, + new_axis, &axis_size); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis_num", axis_size ); + vsi_nn_kernel_param_add_float32( param, "scale", scale ); + + if (ret) + { + uint32_t i = 0; + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], rank_in ); + for (i = 0; i < axis_size; i++) + { + shapes[0][i] = 1; + } + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[0], rank_in ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "reduce_mean", + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_reduce_mean_internal_param * p = &(self->nn_param.reduce_mean_internal); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + memcpy(outputs[0]->attr.size, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t)); + + for (i = 0; i < p->axis_num; i++) + { + outputs[0]->attr.size[p->axis[i]] = 1; + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REDUCE_MEAN_INTERNAL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c index c04009a..f9213ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c @@ -110,6 +110,7 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I32, D_I32) IO_TYPE(D_I32, D_U32, D_U32) IO_TYPE(D_I32, D_F32, D_F32) + IO_TYPE(D_I32, D_BF16,D_BF16) END_IO_TYPE_DECL(SCATTER_ND) if(!VALIDATE_OP_IO_TYPES(SCATTER_ND, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 0f7ff2b..81a0afd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -81,7 +81,7 @@ static vsi_status _create_split_softmax return VSI_SUCCESS; } /* _create_split_softmax() */ -static vsi_status vsi_nn_softmax_compute +vsi_status op_compute ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -161,7 +161,7 @@ static vsi_status vsi_nn_softmax_compute } return status; -} /* vsi_nn_softmax_compute() */ +} /* op_compute() */ static vsi_status op_optimize ( @@ -296,7 +296,7 @@ DEF_OP_REG ( /* op_name */ SOFTMAX_INTERNAL, /* init */ NULL, - /* compute */ vsi_nn_softmax_compute, + /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, /* setup */ vsi_nn_op_common_setup, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c index 86d46dd..8daa728 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c @@ -49,6 +49,7 @@ static vsi_status op_compute { status = VSI_FAILURE; } + self->n = (vx_node)n; return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index a514514..1dbe3ca 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -118,6 +118,16 @@ static vsi_bool op_setup } } + if (1 == node->input.num) + { + curr = vsi_nn_internal_new_node( node, VSI_NN_OP_RESHAPE2, 1, 1); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + goto final; + } + input_shape[0] = block_size; input_shape[1] = block_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c 
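Editor's note on the stack shortcut above: stacking a single tensor only introduces a new axis of size 1 and leaves the element order untouched, so the whole operation degenerates to a RESHAPE2 onto the output shape. The shape transformation in isolation (stack_single_shape is illustrative; out must hold rank + 1 entries):

    #include <stddef.h>

    /* Stacking one [d0, ..., dk] tensor along `axis` yields the same data
     * viewed as [d0, ..., 1 inserted at axis, ..., dk]. */
    static void stack_single_shape(const size_t *in, size_t rank,
                                   size_t axis, size_t *out)
    {
        size_t i, j = 0;
        for (i = 0; i < rank + 1; ++i)
        {
            out[i] = (i == axis) ? 1 : in[j++];
        }
    }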
b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index c0a0562..76495df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -467,6 +467,7 @@ static vsi_bool op_check IO_TYPE(D_F16, D_BF16) IO_TYPE(D_F16, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I32|Q_ASYM) END_IO_TYPE_DECL(STRIDED_SLICE) if (!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, self->input.num, outputs, self->output.num)) diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 9f8ca77..bae1005 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -54,7 +54,7 @@ static void _try_open_file VSILOGW( "File handle is not NULL." ); fclose( *fp ); } - *fp = fopen( file_path, mode ); + *fp = vsi_nn_fopen( file_path, mode ); if( NULL == *fp ) { VSILOGE( "Open file %s fail.", file_path ); @@ -437,6 +437,9 @@ static _op_param_gen_t s_op_gen[] = /* GRUCELL_ACTIVATION */ NULL, /* RESHAPE2 */ NULL, /* CONV3D */ NULL, + /* DECONV3D */ NULL, + /* PAD2 */ NULL, + /* COS */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); @@ -557,4 +560,3 @@ void vsi_nn_GenGraphCCode _try_close_file( &s_dfile_hndl ); _try_close_file( &s_net_file_hndl ); } /* vsi_nn_GenGraphCCode() */ - diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index acca854..92dedcc 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -267,6 +267,12 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm case U8: return vsi_nn_dtype_convert_float_to_quantize_asymm8( buffer, size, scale, zero_point, (uint8_t*)out_buffer ); + case I8: + return vsi_nn_dtype_convert_float_to_quantize_symm8( + buffer, size, scale, zero_point, (int8_t*)out_buffer ); + case I16: + return vsi_nn_dtype_convert_float_to_quantize_symm16( + buffer, size, scale, zero_point, (int16_t*)out_buffer ); default: VSILOGE("Don't support convert float to asymm quant %d.", dtype); break; @@ -413,6 +419,12 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case I8: + return vsi_nn_dtype_convert_quantize_symm8_to_float( + (const int8_t *)buffer, size, scale, zero_point, out_buffer ); + case I16: + return vsi_nn_dtype_convert_quantize_symm16_to_float( + (const int16_t *)buffer, size, scale, zero_point, out_buffer ); case I32: return vsi_nn_dtype_convert_quantize_symm32_to_float( (const int *)buffer, size, scale, zero_point, out_buffer ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index b05fdab..bd14b39 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -23,6 +23,7 @@ *****************************************************************************/ #include #include +#include #include #include #include @@ -103,6 +104,69 @@ _compiler_assert(VX_STATUS_MIN == -25, VX_STATUS_VALUE_CHANGED); static const int16_t vx_status_desc_cnt = _cnt_of_array( vx_status_desc ); +char* vsi_nn_strncpy + ( + char* dest, + const char* source, + size_t count + ) +{ + char* ret = NULL; + #ifdef _MSC_VER + strncpy_s(dest, count, source, _TRUNCATE); + #else + strncpy(dest, source, count); + #endif + return ret; +} + +char* 
vsi_nn_strncat + ( + char* dest, + const char* source, + size_t count + ) +{ + char* ret = NULL; + #ifdef _MSC_VER + strncat_s(dest, count, source, _TRUNCATE); + ret = dest; + #else + ret = strncat(dest, source, count); + #endif + return ret; +} + +char* vsi_nn_getenv + ( + const char * var_name + ) +{ + char* var = NULL; + #ifdef _MSC_VER + size_t var_size = 0; + _dupenv_s(&var, &var_size, var_name); + #else + var = getenv(var_name); + #endif + return var; +}; + +FILE* vsi_nn_fopen + ( + const char * file_name, + const char * mode + ) +{ + FILE * file = NULL; + #ifdef _MSC_VER + fopen_s(&file, file_name, mode); + #else + file = fopen(file_name, mode); + #endif + return file; +} + static vsi_size_t _compute_stride_rounding ( vsi_size_t out, @@ -148,7 +212,7 @@ uint8_t * vsi_nn_LoadBinaryData vsi_size_t cnt; FILE * fp; - fp = fopen( filename, "rb" ); + fp = vsi_nn_fopen( filename, "rb" ); if( NULL == fp ) { return NULL; @@ -867,21 +931,21 @@ void vsi_nn_FormatToString { switch(tensor->attr.dtype.vx_type) { - case VSI_NN_TYPE_INT4:strncpy(buf, "i4 ", buf_sz);break; - case VSI_NN_TYPE_INT8:strncpy(buf, "i8 ", buf_sz);break; - case VSI_NN_TYPE_INT16:strncpy(buf, "i16", buf_sz);break; - case VSI_NN_TYPE_INT32:strncpy(buf, "i32", buf_sz);break; - case VSI_NN_TYPE_INT64:strncpy(buf, "i64", buf_sz);break; - case VSI_NN_TYPE_UINT4:strncpy(buf, "u4 ", buf_sz);break; - case VSI_NN_TYPE_UINT8:strncpy(buf, "u8 ", buf_sz);break; - case VSI_NN_TYPE_UINT16:strncpy(buf, "u16", buf_sz);break; - case VSI_NN_TYPE_UINT32:strncpy(buf, "u32", buf_sz);break; - case VSI_NN_TYPE_UINT64:strncpy(buf, "u64", buf_sz);break; - case VSI_NN_TYPE_FLOAT16:strncpy(buf, "f16", buf_sz);break; - case VSI_NN_TYPE_FLOAT32:strncpy(buf, "f32", buf_sz);break; - case VSI_NN_TYPE_FLOAT64:strncpy(buf, "f64", buf_sz);break; - case VSI_NN_TYPE_BFLOAT16:strncpy(buf, "bf16", buf_sz);break; - case VSI_NN_TYPE_BOOL8:strncpy(buf, "bool8", buf_sz);break; + case VSI_NN_TYPE_INT4:vsi_nn_strncpy(buf, "i4 ", buf_sz);break; + case VSI_NN_TYPE_INT8:vsi_nn_strncpy(buf, "i8 ", buf_sz);break; + case VSI_NN_TYPE_INT16:vsi_nn_strncpy(buf, "i16", buf_sz);break; + case VSI_NN_TYPE_INT32:vsi_nn_strncpy(buf, "i32", buf_sz);break; + case VSI_NN_TYPE_INT64:vsi_nn_strncpy(buf, "i64", buf_sz);break; + case VSI_NN_TYPE_UINT4:vsi_nn_strncpy(buf, "u4 ", buf_sz);break; + case VSI_NN_TYPE_UINT8:vsi_nn_strncpy(buf, "u8 ", buf_sz);break; + case VSI_NN_TYPE_UINT16:vsi_nn_strncpy(buf, "u16", buf_sz);break; + case VSI_NN_TYPE_UINT32:vsi_nn_strncpy(buf, "u32", buf_sz);break; + case VSI_NN_TYPE_UINT64:vsi_nn_strncpy(buf, "u64", buf_sz);break; + case VSI_NN_TYPE_FLOAT16:vsi_nn_strncpy(buf, "f16", buf_sz);break; + case VSI_NN_TYPE_FLOAT32:vsi_nn_strncpy(buf, "f32", buf_sz);break; + case VSI_NN_TYPE_FLOAT64:vsi_nn_strncpy(buf, "f64", buf_sz);break; + case VSI_NN_TYPE_BFLOAT16:vsi_nn_strncpy(buf, "bf16", buf_sz);break; + case VSI_NN_TYPE_BOOL8:vsi_nn_strncpy(buf, "bool8", buf_sz);break; default: break; } @@ -1199,6 +1263,8 @@ int32_t vsi_nn_get_tensor_zero_point switch (tensor->attr.dtype.qnt_type) { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + zero_point = 0; + break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: zero_point = tensor->attr.dtype.zero_point; break; @@ -1226,7 +1292,14 @@ void vsi_nn_get_tensor_clamp_min_max } else if (vx_type == VSI_NN_TYPE_INT8) { - *clampMin = -128 - zero_point; + if (input->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC) + { + *clampMin = -127 - zero_point; + } + else + { + *clampMin = -128 - zero_point; + } *clampMax = 127 - zero_point; } 
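Editor's note on the two quantization tweaks above: a symmetric affine tensor now always reports a zero point of 0, and its int8 clamp floor moves from -128 to -127 so the representable magnitudes stay symmetric around zero, with the -128 code left unused as in most symmetric int8 schemes. A compact restatement of those ranges (illustrative types, not the ovxlib enums):

    #include <stdint.h>

    typedef enum { QNT_AFFINE_ASYMMETRIC, QNT_AFFINE_SYMMETRIC } qnt_kind_t;

    /* Clamp limits for an int8 quantized tensor, expressed as code-point
     * offsets relative to its zero point, as in the hunk above. */
    static void int8_clamp_range(qnt_kind_t kind, int32_t zero_point,
                                 float *lo, float *hi)
    {
        if (kind == QNT_AFFINE_SYMMETRIC)
        {
            zero_point = 0;                  /* symmetric tensors pin the zero point */
            *lo = -127.0f - (float)zero_point;  /* -128 excluded to keep range symmetric */
        }
        else
        {
            *lo = -128.0f - (float)zero_point;
        }
        *hi = 127.0f - (float)zero_point;
    }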
else if (vx_type == VSI_NN_TYPE_INT16) diff --git a/src/tim/vx/internal/src/vsi_nn_client_op.c b/src/tim/vx/internal/src/vsi_nn_client_op.c index fcfa365..b7983d9 100644 --- a/src/tim/vx/internal/src/vsi_nn_client_op.c +++ b/src/tim/vx/internal/src/vsi_nn_client_op.c @@ -34,6 +34,7 @@ typedef struct _client_node { vsi_nn_op_t op; vsi_nn_op_proc_t proc; + const char* kernel_name; } _client_node_t; static vsi_nn_binary_tree_t * s_root = NULL; @@ -139,3 +140,41 @@ void vsi_nn_OpRemoveClient } } /* vsi_nn_OpRemoveClient() */ +vsi_bool vsi_nn_OpAddClientName + ( + vsi_nn_op_t op, + const char* kernel_name + ) +{ + _client_node_t * node; + vsi_bool ret; + + ret = FALSE; + node = (_client_node_t *)vsi_nn_BinaryTreeGetNode( + &s_root, + (vsi_nn_binary_tree_key_t)op ); + if( NULL != node && NULL != kernel_name) + { + node->kernel_name = kernel_name; + ret = TRUE; + } + return ret; +}/* vsi_nn_OpAddClientName() */ + +const char * vsi_nn_OpGetClientName + ( + vsi_nn_op_t op + ) +{ + _client_node_t * node; + + node = (_client_node_t *)vsi_nn_BinaryTreeGetNode( + &s_root, + (vsi_nn_binary_tree_key_t)op ); + + if( NULL != node ){ + return node->kernel_name; + }else{ + return NULL; + } +} /* vsi_nn_OpGetClientName() */ \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index f453b32..acd5b4f 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -86,7 +86,7 @@ final: int32_t vsi_nn_getEnv(const char* name, char** env_s) { int32_t ret = 0; - *env_s = getenv(name); + *env_s = vsi_nn_getenv(name); if (*env_s) { ret = TRUE; } @@ -121,6 +121,13 @@ static vsi_status vsi_nn_initOptions options->enable_concat_optimize = atoi(env_s); } + env_s = NULL; + options->enable_asymi8_to_u8 = 1; + if (vsi_nn_getEnv("VSI_NN_ENABLE_I8TOU8", &env_s) && env_s) + { + options->enable_asymi8_to_u8 = atoi(env_s); + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index b721265..76f4f2c 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -614,8 +614,8 @@ void vsi_nn_ReleaseGraph uint32_t i; vsi_nn_graph_t * ptr; - ptr = *graph; - if( NULL != graph && NULL != * graph ) + ptr = (NULL != graph) ? 
*graph : NULL; + if( NULL != ptr) { if( NULL != ptr->nodes ) { @@ -657,7 +657,6 @@ void vsi_nn_ReleaseGraph free( ptr ); *graph = NULL; } - } /* vsi_nn_ReleaseGraph() */ /* @@ -1171,6 +1170,65 @@ vsi_nn_node_t * vsi_nn_AppendNode return vsi_nn_AddNode( graph, op, 0, 0, node_id ); } /* vsi_nn_AppendNode() */ +vsi_nn_node_t * vsi_nn_AddExternalNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + const void * proc, + vsi_nn_node_id_t * node_id, + const char * kernel_name + ) +{ + vsi_nn_node_t * node; + vsi_nn_node_id_t id; + vsi_nn_op_proc_t * node_proc; + + node_proc = (vsi_nn_op_proc_t*)proc; + + if( NULL == graph ) + { + return NULL; + } + node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); + + if( NULL != node ) + { + memset( node, 0, sizeof( vsi_nn_node_t ) ); + node->graph = graph; + node->op = op; + node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; + node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + + /* init op */ + if(node_proc->init != NULL){ + //TODO + } + + /* init output struct */ + node->output.num = node_proc->output_num; + node->output.tensors = (vsi_nn_tensor_id_t *) malloc( + node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) ); + vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num ); + + /* init input struct */ + node->input.num = node_proc->input_num; + node->input.tensors = (vsi_nn_tensor_id_t *) malloc( + node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) ); + vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num ); + node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; + node->attr.enable_op_constraint_check = TRUE; + } + id = graph->cur_nid; + if(NULL != node){ + vsi_nn_MapAdd( graph->node_table, (vsi_nn_map_key_t)id, (void *)node ); + graph->node_num = graph->cur_nid; + graph->cur_nid ++; + } + vsi_nn_OpRegisterExternalOvxInit(op, kernel_name, node_proc); + return node; +} /* vsi_nn_AddExternalNode() */ + void vsi_nn_RemoveNode ( vsi_nn_graph_t * graph, @@ -1251,7 +1309,6 @@ vsi_bool vsi_nn_SetGraphOutputs } return ret; - } /* vsi_nn_SetGraphOutputs() */ vsi_nn_node_id_t * vsi_nn_SortGraphNode @@ -1507,10 +1564,10 @@ void vsi_nn_DumpGraphNodeOutputsEx if( NULL != prefix ) { - strncpy(filename_prefix, prefix, _SHAPE_BUF_SZ); + vsi_nn_strncpy(filename_prefix, prefix, _SHAPE_BUF_SZ); filename_prefix[_SHAPE_BUF_SZ - 1] = '\0'; - strncat(filename_prefix, "_", _SHAPE_BUF_SZ - 1); + vsi_nn_strncat(filename_prefix, "_", _SHAPE_BUF_SZ - 1); filename_prefix[_SHAPE_BUF_SZ - 1] = '\0'; } @@ -1611,7 +1668,7 @@ void vsi_nn_DumpGraphToJson return ; } - fp = fopen("graph.json", "w+"); + fp = vsi_nn_fopen("graph.json", "w+"); if(NULL == fp) { VSILOGE("Create dump file fail"); diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index fb17d8b..8e57205 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -205,7 +205,6 @@ static void _get_graph_input_asymm_int8_norm_tensor { tensor_ids[id_count ++] = id; } - } tensor_count += 1; } @@ -867,7 +866,7 @@ vsi_status vsi_nn_OptimizeGraph } } - if (!nbg_flag) + if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8) { status = _graph_optimization_convert_int8_to_uint8(graph, dirty); TEST_CHECK_STATUS(status, final); @@ -876,4 +875,3 @@ vsi_status vsi_nn_OptimizeGraph final: return status; } /* vsi_nn_OptimizeGraph() */ - diff --git 
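Editor's note on the new VSI_NN_ENABLE_I8TOU8 option above: it gates the existing asymmetric-int8-to-uint8 graph rewrite. The rewrite itself is not shown in this hunk, but the value mapping underneath is the standard one for affine quantization: shifting both the stored codes and the zero point by 128 leaves every represented real value unchanged, since real = scale * (q - zp). A sketch of that mapping, for illustration only:

    #include <stdint.h>
    #include <stddef.h>

    /* Re-express an asymmetric int8 tensor as uint8: moving the codes and the
     * zero point up by 128 preserves scale * (q - zp) for every element. */
    static void asym_i8_to_u8(const int8_t *src, uint8_t *dst, size_t n,
                              int32_t *zero_point /* in/out */)
    {
        size_t i;
        for (i = 0; i < n; ++i)
        {
            dst[i] = (uint8_t)(src[i] + 128);
        }
        *zero_point += 128;
    }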
a/src/tim/vx/internal/src/vsi_nn_log.c b/src/tim/vx/internal/src/vsi_nn_log.c index 9ee1114..25d421b 100644 --- a/src/tim/vx/internal/src/vsi_nn_log.c +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -40,7 +40,6 @@ static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; #endif int get_env_as_int(const char* env, int default_value) { - int value = default_value; #ifdef __ANDROID__ { @@ -52,7 +51,7 @@ int get_env_as_int(const char* env, int default_value) { } #else { - char* env_s = getenv(env); + char* env_s = vsi_nn_getenv(env); if (env_s) { value = atoi(env_s); } diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index 0d5fbc8..86d4937 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -106,8 +106,8 @@ void vsi_nn_ReleaseNode ) { vsi_nn_node_t * ptr; - ptr = *node; - if( NULL != node && NULL != *node ) + ptr = (NULL != node) ? *node : NULL; + if( NULL != ptr) { vsi_nn_OpDeinit( ptr->op, ptr ); if( NULL != ptr->input.tensors ) diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index 0214680..8ca7df2 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -87,6 +87,15 @@ static const char * vsi_nn_internal_ops_name[] = }; #undef DEF_OP + +vsi_bool _is_external_ops(vsi_nn_op_t op) { + vsi_bool ret = FALSE; + if (op < 0) { + ret = TRUE; + } + return ret; +} + vsi_bool _is_custom_ops ( vsi_nn_op_t op @@ -357,13 +366,33 @@ vsi_bool vsi_nn_OpRegisterOvxInit return ret; } /* vsi_nn_OpRegisterClientCompute() */ +vsi_bool vsi_nn_OpRegisterExternalOvxInit + ( + vsi_nn_op_t op, + const char* kernel_name, + vsi_nn_op_proc_t* proc + ) +{ + vsi_bool ret; + + ret = FALSE; + if (vsi_nn_OpRegisterClient(op, proc) && + vsi_nn_OpAddClientName(op, kernel_name)) { + ret = TRUE; + } + return ret; +} + const char * vsi_nn_OpGetName ( vsi_nn_op_t op ) { const char * name; - if( op < VSI_NN_OP_NUM ) + if(_is_external_ops(op)){ + name = vsi_nn_OpGetClientName(op); + } + else if( op < VSI_NN_OP_NUM ) { name = vsi_nn_ops_name[op]; } diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 06dd052..8c6c7ba 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -118,22 +118,37 @@ static void _set_preproc_node_rect_params ( vsi_nn_node_t* node, vsi_nn_preprocess_crop_t* crop, - vsi_nn_preprocess_image_size_t* input_size + vsi_nn_preprocess_image_size_t* input_size, + vsi_nn_preprocess_source_format_e* source_format ) { if(crop != NULL) { - node->nn_param.pre_process.rect.left = crop->begin[0]; - node->nn_param.pre_process.rect.top = crop->begin[1]; - node->nn_param.pre_process.rect.width = crop->size[0]; - node->nn_param.pre_process.rect.height = crop->size[1]; + if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR) + { + VSILOGW("don not need to set crop parameter for tensor preprocess"); + } + else + { + node->nn_param.pre_process.rect.left = crop->begin[0]; + node->nn_param.pre_process.rect.top = crop->begin[1]; + node->nn_param.pre_process.rect.width = crop->size[0]; + node->nn_param.pre_process.rect.height = crop->size[1]; + } } - else + else if (*source_format != VSI_NN_SOURCE_FORMAT_TENSOR) { - node->nn_param.pre_process.rect.left = 0; - node->nn_param.pre_process.rect.top = 0; - node->nn_param.pre_process.rect.width = input_size->w; - node->nn_param.pre_process.rect.height = input_size->h; + if(input_size == NULL) + { + 
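Editor's note on the vsi_nn_ReleaseNode change above (and the matching vsi_nn_ReleaseGraph and vsi_nn_ReleaseTensor edits in this patch): the old code read *handle before checking the handle pointer for NULL, so passing a NULL handle was undefined behaviour even though the very next line tried to guard against it. The corrected release pattern, shown on a generic handle type:

    #include <stdlib.h>

    typedef struct thing { int payload; } thing_t;

    /* Safe double-pointer release: only dereference after the handle itself
     * is known to be non-NULL, then clear the caller's pointer. */
    static void thing_release(thing_t **handle)
    {
        thing_t *ptr = (handle != NULL) ? *handle : NULL;
        if (ptr != NULL)
        {
            free(ptr);
            *handle = NULL;
        }
    }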
VSILOGE("Please set image size for preprocess node"); + } + else + { + node->nn_param.pre_process.rect.left = 0; + node->nn_param.pre_process.rect.top = 0; + node->nn_param.pre_process.rect.width = input_size->w; + node->nn_param.pre_process.rect.height = input_size->h; + } } } /* _set_preproc_node_rect_params() */ @@ -490,7 +505,7 @@ vsi_status vsi_nn_add_single_preproc_node status = _set_preproc_node_type(node, source_format); TEST_CHECK_STATUS(status, final); - _set_preproc_node_rect_params(node, crop, input_size); + _set_preproc_node_rect_params(node, crop, input_size, source_format); _set_preproc_node_norm_params(node, mean_and_scale, &org_norm_tensor->attr); if(permute != NULL) diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index e82e537..80ea7a6 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -139,7 +139,7 @@ static void print_tensor break; #endif default: - strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); + vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); break; } @@ -475,18 +475,18 @@ static vsi_bool _init_tensor { #ifdef VSI_40BIT_VA_SUPPORT { - vx_size size_vxsize[_cnt_of_array(tensor->attr.size)] = {0}; + vx_size size_vxsize2[_cnt_of_array(tensor->attr.size)] = {0}; vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize2[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, - size_vxsize, stride_size_vxsize, (vx_size)tensor->attr.dim_num); + size_vxsize2, stride_size_vxsize, (vx_size)tensor->attr.dim_num); } #else { @@ -785,8 +785,8 @@ void vsi_nn_ReleaseTensor ) { vsi_nn_tensor_t * ptr; - ptr = *tensor; - if( NULL != tensor && NULL != *tensor ) + ptr = (NULL != tensor) ? *tensor : NULL; + if( NULL != ptr) { uint8_t * handle = NULL; if( NULL != ptr->t ) @@ -1224,7 +1224,7 @@ void vsi_nn_SaveTensorToTextByFp32 return; } - fp = fopen( filename, "w" ); + fp = vsi_nn_fopen( filename, "w" ); if( NULL == fp ) { VSILOGW( "Write file %s fail. Please check...", filename ); @@ -1313,7 +1313,7 @@ void vsi_nn_SaveDataToText return; } - fp = fopen( filename, "w" ); + fp = vsi_nn_fopen( filename, "w" ); if( NULL == fp ) { VSILOGW( "Write file %s fail. Please check...", filename ); @@ -1358,6 +1358,8 @@ void vsi_nn_SaveTensorToBinary FILE * fp; vsi_size_t sz; uint32_t i; + uint8_t * packed_data = NULL; + vsi_size_t packed_size; if( NULL == graph || NULL == tensor || NULL == filename ) { @@ -1365,24 +1367,42 @@ void vsi_nn_SaveTensorToBinary } data = vsi_nn_ConvertTensorToData( graph, tensor ); + if( NULL == data ) { VSILOGE( "Convert data fail." ); return; } - fp = fopen( filename, "wb" ); + fp = vsi_nn_fopen( filename, "wb" ); if( NULL == fp ) { VSILOGW( "Write file %s fail. 
Please check...", filename ); return; } sz = (vsi_size_t)vsi_nn_GetTypeBytes( tensor->attr.dtype.vx_type ); - for( i = 0; i < tensor->attr.dim_num; i ++ ) + if( tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || + tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4 ) { - sz *= tensor->attr.size[i]; + packed_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type); + packed_data = (uint8_t*)malloc(packed_size); + vsi_nn_Pack4bitData(tensor, data, packed_data); + fwrite( packed_data, packed_size, 1, fp ); + if( packed_data ) + { + free(packed_data); + packed_data = NULL; + } + } + else + { + for( i = 0; i < tensor->attr.dim_num; i ++ ) + { + sz *= tensor->attr.size[i]; + } + fwrite( data, sz, 1, fp ); } - fwrite( data, sz, 1, fp ); fclose( fp ); vsi_nn_safe_free( data ); } /* vsi_nn_SaveTensorToBinary() */ @@ -2720,4 +2740,4 @@ final: vsi_nn_safe_free(data); return output; -} \ No newline at end of file +}