Pre-release for 22Q1 (#302)

update internal to commit-id: d45da6fa

Co-authored-by: zhouheng.zheng <zhouheng.zheng@ouotlook.com>
Zhouheng Zheng 2022-03-01 17:56:03 +08:00 committed by GitHub
parent e63059857b
commit 161bb8a7c4
149 changed files with 12641 additions and 970 deletions

View File

@@ -2,3 +2,6 @@
custom op data struct def
*/
DEF_NODE_TYPE(custom_softmax)
DEF_NODE_TYPE(custom_ainr_denoise_postprocess)
DEF_NODE_TYPE(custom_warp_affine)
DEF_NODE_TYPE(custom_warp_perspective)

View File

@@ -2,3 +2,6 @@
Add custom ops to the end.
*/
DEF_OP(CUSTOM_SOFTMAX)
DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS)
DEF_OP(CUSTOM_WARP_AFFINE)
DEF_OP(CUSTOM_WARP_PERSPECTIVE)
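Usage note (illustrative sketch, not part of the diff): each DEF_OP / DEF_NODE_TYPE entry is consumed through the X-macro pattern that appears later in this diff, so the new custom ops become ordinary op ids and node-parameter members. The enum wrapper and include path below are illustrative.
/* Sketch of the X-macro expansion. */
#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME,
enum
{
#include "custom/custom_ops.def"   /* path illustrative; yields VSI_NN_OP_CUSTOM_SOFTMAX, VSI_NN_OP_CUSTOM_WARP_AFFINE, ... */
};
#undef DEF_OP
#define DEF_NODE_TYPE( NAME ) vsi_nn_##NAME##_param NAME;
DEF_NODE_TYPE(custom_warp_affine)  /* expands to: vsi_nn_custom_warp_affine_param custom_warp_affine; */
#undef DEF_NODE_TYPE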

View File

@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H
#define _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_ainr_denoise_postprocess_param
{
struct _ainr_denoise_postprocess_local_data_t* local;
// Add parameters here
} vsi_nn_custom_ainr_denoise_postprocess_param;
_compiler_assert(offsetof(vsi_nn_custom_ainr_denoise_postprocess_param, local) == 0, \
vsi_nn_custom_ainr_denoise_postprocess_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_WARP_AFFINE_H
#define _VSI_NN_OP_CUSTOM_WARP_AFFINE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_warp_affine_param
{
struct _custom_warp_affine_local_data_t* local;
// Add parameters here
const float *matrix;
vsi_enum type;
int32_t size[2];
} vsi_nn_custom_warp_affine_param;
_compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \
vsi_nn_custom_warp_affine_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,50 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H
#define _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_warp_perspective_param
{
struct _custom_warp_perspective_local_data_t* local;
// Add parameters here
const float *matrix;
vsi_enum type;
int32_t size[2];
} vsi_nn_custom_warp_perspective_param;
_compiler_assert(offsetof(vsi_nn_custom_warp_perspective_param, local) == 0, \
vsi_nn_custom_warp_perspective_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -27,5 +27,8 @@
custom op head files
*/
#include "custom/ops/vsi_nn_op_custom_softmax.h"
#include "custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h"
#include "custom/ops/vsi_nn_op_custom_warp_affine.h"
#include "custom/ops/vsi_nn_op_custom_warp_perspective.h"
#endif

View File

@@ -165,3 +165,6 @@ DEF_OP(GRUCELL)
DEF_OP(GRUCELL_ACTIVATION)
DEF_OP(RESHAPE2)
DEF_OP(CONV3D)
DEF_OP(DECONV3D)
DEF_OP(PAD2)
DEF_OP(COS)

View File

@@ -19,3 +19,4 @@ DEF_OP(RESIZE_1D_NEAREST_INTERNAL)
DEF_OP(SPACE2DEPTH_INTERNAL)
DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R)
DEF_OP(GRUCELL_ACTIVATION_Z_H)
DEF_OP(REDUCE_MEAN_INTERNAL)

View File

@@ -640,6 +640,13 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node
vsi_nn_kernel_t * kernel
);
vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
);
vsi_status vsi_nn_kernel_node_set_border
(vsi_nn_kernel_node_t node,
vx_border_t* border);
@@ -720,6 +727,13 @@ vsi_status vsi_nn_kernel_register
vsi_nn_kernel_t * kernel
);
vsi_status vsi_nn_kernel_register_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
);
vsi_bool vsi_nn_kernel_gpu_check_shape
( const vsi_size_t * shape, vsi_size_t rank );
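Usage sketch (illustrative, not from the diff) for the new *_ext entry points. It assumes that resources is a NULL-terminated list of kernel source names, mirroring the names passed to vsi_nn_kernel_add_source elsewhere in this diff; treat that convention and the names below as placeholders.
/* Hypothetical usage, given an initialized graph and kernel. */
const char* resources[] = { "my_custom_kernel_source", NULL };  /* placeholder names, assumed NULL-terminated */
if ( VSI_SUCCESS == vsi_nn_kernel_register_ext( graph, kernel, resources ) )
{
    vsi_nn_kernel_node_t node = vsi_nn_kernel_create_node_ext( graph, kernel, resources );
    (void)node;
}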

View File

@@ -79,4 +79,10 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
vsi_size_t* out_shape, uint32_t* out_rank
);
vsi_bool vsi_nn_kernel_optimize_group_norm_shape
(
const vsi_size_t* shape, const uint32_t rank, int32_t groups,
int32_t is_sp_kernel, vsi_size_t* out_shape
);
#endif

View File

@@ -0,0 +1,54 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_DECONV3D_H
#define _VSI_NN_OP_DECONV3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_deconv3d_param
{
struct _deconv3d_local_data_t* local;
// Add parameters here
uint32_t ksize[3];
uint32_t stride[3];
/* Pad left, right, top, bottom, front, rear */
uint32_t pad[6];
uint32_t weights;
uint32_t group;
uint32_t output_padding[3];
} vsi_nn_deconv3d_param;
_compiler_assert(offsetof(vsi_nn_deconv3d_param, local) == 0, \
vsi_nn_deconv3d_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -41,6 +41,7 @@ typedef struct _vsi_nn_gather_param
{
vsi_nn_gather_lcl_data local;
int32_t axis;
int32_t batch_dims;
} vsi_nn_gather_param;
#ifdef __cplusplus

View File

@@ -0,0 +1,50 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_PAD2_H
#define _VSI_NN_OP_PAD2_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_pad2_param
{
struct _pad2_local_data_t* local;
const uint32_t * front_size;
const uint32_t * back_size;
uint8_t dim_num;
float const_val;
vsi_nn_pad_mode_e mode;
} vsi_nn_pad2_param;
_compiler_assert(offsetof(vsi_nn_pad2_param, local) == 0, \
vsi_nn_pad2_h );
#ifdef __cplusplus
}
#endif
#endif
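Usage sketch (illustrative, not from the diff): filling in the new pad2 parameters on a node. The node-creation call, input/output counts and the pad-mode value are assumptions; the field names come from vsi_nn_pad2_param above.
/* Hypothetical usage: pad width/height by one element on each side with constant 0. */
vsi_nn_node_t* node = vsi_nn_AddNode( graph, VSI_NN_OP_PAD2, 1, 1, NULL );
static const uint32_t front[4] = { 1, 1, 0, 0 };
static const uint32_t back[4]  = { 1, 1, 0, 0 };
node->nn_param.pad2.front_size = front;
node->nn_param.pad2.back_size  = back;
node->nn_param.pad2.dim_num    = 4;
node->nn_param.pad2.const_val  = 0.0f;
node->nn_param.pad2.mode       = VSI_NN_PAD_MODE_CONSTANT;  /* assumed constant-pad mode value */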

View File

@@ -51,7 +51,7 @@ typedef struct _vsi_nn_reduce_param
{
/* local data must be the first. */
vsi_nn_reduce_lcl_data_t local;
vx_enum type;
vsi_enum type;
const int32_t *axis;
vx_uint32 axis_num;
vx_bool keep_dim;

View File

@@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H
#define _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_reduce_mean_internal_param
{
struct _reduce_mean_internal_local_data_t* local;
// Add parameters here
vx_int32 *axis;
vx_uint32 axis_num;
float scale;
} vsi_nn_reduce_mean_internal_param;
_compiler_assert(offsetof(vsi_nn_reduce_mean_internal_param, local) == 0, \
vsi_nn_reduce_mean_internal_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -28,6 +28,7 @@
/*-------------------------------------------
Includes
-------------------------------------------*/
#include <stdio.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_types.h"
@@ -398,6 +399,31 @@ void vsi_nn_get_tensor_clamp_min_max
float *clampMax
);
char* vsi_nn_strncpy
(
char* dest,
const char* source,
size_t count
);
char* vsi_nn_strncat
(
char* dest,
const char* source,
size_t count
);
char* vsi_nn_getenv
(
const char * var_name
);
FILE* vsi_nn_fopen
(
const char * file_name,
const char * mode
);
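Usage sketch (illustrative, not from the diff) for the new portability wrappers; they are assumed to follow the semantics of the standard strncpy/strncat/getenv/fopen, and the environment-variable and file names below are placeholders.
/* Hypothetical usage. */
char path[256] = { 0 };
char* dir = vsi_nn_getenv( "MY_KERNEL_DIR" );        /* placeholder variable name */
if ( dir )
{
    vsi_nn_strncpy( path, dir, sizeof(path) - 1 );
    vsi_nn_strncat( path, "/kernel.cfg", sizeof(path) - strlen(path) - 1 );
    FILE* fp = vsi_nn_fopen( path, "rb" );
    if ( fp )
    {
        fclose( fp );
    }
}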
#ifdef __cplusplus
}
#endif

View File

@@ -71,6 +71,17 @@ OVXLIB_API void vsi_nn_OpRemoveClient
vsi_nn_op_t op
);
vsi_bool vsi_nn_OpAddClientName
(
vsi_nn_op_t op,
const char* kernel_name
);
const char* vsi_nn_OpGetClientName
(
vsi_nn_op_t op
);
#if defined(__cplusplus)
}
#endif

View File

@@ -73,6 +73,7 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_shader;
int32_t enable_opcheck;
int32_t enable_concat_optimize;
int32_t enable_asymi8_to_u8;
} vsi_nn_runtime_option_t;
/**

View File

@@ -1,26 +1,3 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
/*****Auto generated header file, Please DO NOT modify manually!*****/
#ifndef _VSI_NN_FEATURE_CONFIG_H
#define _VSI_NN_FEATURE_CONFIG_H

View File

@@ -456,6 +456,29 @@ OVXLIB_API vsi_nn_node_t * vsi_nn_AddNode
vsi_nn_node_id_t * node_id
);
/**
 * Add external node
 * Create a new external node and attach it to the graph.
 *
 * @param[in] graph Graph handle.
 * @param[in] op Node operation.
 * @param[in] proc Operation proc for this node.
 * @param[out] node_id A handle to get the id of the new node,
 * pass NULL to get nothing.
 * @param[in] kernel_name Kernel name registered for this node.
*
* @return The node handle on success, or NULL otherwise.
*/
OVXLIB_API vsi_nn_node_t * vsi_nn_AddExternalNode
(
vsi_nn_graph_t * graph,
vsi_nn_op_t op,
const void * proc,
vsi_nn_node_id_t * node_id,
const char *kernel_name
);
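Usage sketch (illustrative, not from the diff) tying together the external-node APIs added in this release (vsi_nn_OpAddClientName, vsi_nn_OpRegisterExternalOvxInit, vsi_nn_AddExternalNode); the op id, proc table and kernel name are placeholders.
/* Hypothetical usage, given an initialized graph and an op proc table. */
extern vsi_nn_op_proc_t my_external_proc;                 /* placeholder proc table */
vsi_nn_op_t op = VSI_NN_OP_CUSTOM_SOFTMAX;                /* placeholder op id */
vsi_nn_OpAddClientName( op, "my_external_kernel" );
vsi_nn_OpRegisterExternalOvxInit( op, "my_external_kernel", &my_external_proc );
vsi_nn_node_t* node = vsi_nn_AddExternalNode( graph, op, &my_external_proc, NULL, "my_external_kernel" );
(void)node;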
/**
* @deprecated
* @see vsi_nn_AddNode

View File

@@ -24,14 +24,18 @@
#ifndef _VSI_NN_LOG_H
#define _VSI_NN_LOG_H
#include <stdio.h>
#include "utils/vsi_nn_util.h"
#if defined(__cplusplus)
extern "C"{
#endif
#ifdef _MSC_VER
#define snprintf _snprintf
#define snprintf(buffer, count, format, ...) \
_snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__)
#define vsnprintf(buffer, count, format, args) \
_vsnprintf_s(buffer, count, _TRUNCATE, format, args)
#endif
typedef enum _vsi_nn_log_level_e
@@ -68,4 +72,3 @@ OVXLIB_API void vsi_nn_LogMsg
#endif
#endif

View File

@@ -182,6 +182,9 @@
#include "ops/vsi_nn_op_conv3d.h"
#include "ops/vsi_nn_op_grucell_h_times_activation_r.h"
#include "ops/vsi_nn_op_grucell_activation_z_h.h"
#include "ops/vsi_nn_op_deconv3d.h"
#include "ops/vsi_nn_op_reduce_mean_internal.h"
#include "ops/vsi_nn_op_pad2.h"
/* custom node header define */
#include "custom/vsi_nn_custom_node_type.h"
@@ -350,7 +353,10 @@ typedef union _vsi_nn_nn_param
vsi_nn_conv3d_param conv3d;
vsi_nn_grucell_h_times_activation_r_param grucell_h_times_activation_r;
vsi_nn_grucell_activation_z_h_param grucell_activation_z_h;
uint8_t client_param[128];
vsi_nn_deconv3d_param deconv3d;
vsi_nn_reduce_mean_internal_param reduce_mean_internal;
vsi_nn_pad2_param pad2;
void* client_param;
/* custom node data struct define */
#define DEF_NODE_TYPE( NAME ) vsi_nn_##NAME##_param NAME;

View File

@@ -48,7 +48,7 @@ extern "C"{
* @see include/custom/custom_ops.def
* @see include/internal/internal_ops.def
*/
typedef uint32_t vsi_nn_op_t; enum
typedef int32_t vsi_nn_op_t; enum
{
#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME,
#include "interface/ops.def"
@@ -317,6 +317,13 @@ vsi_bool vsi_nn_OpRegisterOvxInit
vsi_nn_op_compute_t compute
);
vsi_bool vsi_nn_OpRegisterExternalOvxInit
(
vsi_nn_op_t op,
const char* kernel_name,
vsi_nn_op_proc_t* proc
);
/**
* Get operation name
* Get operation name string by operation id.

View File

@@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 37
#define VSI_NN_VERSION_PATCH 39
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
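/* For reference: with these values VSI_NN_VERSION evaluates to 1 * 10000 + 1 * 100 + 39 = 10139. */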

View File

@@ -77,7 +77,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec)
/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );

View File

@@ -0,0 +1,296 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_affine")
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def )
#define SCALAR_INPUT_TYPE (2)
#define SCALAR_MATRIX_OFFSET (3)
static void _transform_affine
(
vsi_size_t dst_x,
vsi_size_t dst_y,
const float m[],
float *src_x,
float *src_y
)
{
*src_x = dst_x * m[0] + dst_y * m[2] + m[4];
*src_y = dst_x * m[1] + dst_y * m[3] + m[5];
}
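/* Note: the six matrix scalars are laid out column-major, i.e.
 *   src_x = m[0]*dst_x + m[2]*dst_y + m[4]
 *   src_y = m[1]*dst_x + m[3]*dst_y + m[5]   (see _transform_affine above). */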
static vsi_bool _read_pixel
(
float *base,
vsi_nn_kernel_tensor_attr_t *attr,
float x,
float y,
float *pixel
)
{
vsi_size_t width = attr->shape->data[0];
vsi_size_t height = attr->shape->data[1];
vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height);
vsi_size_t bx = 0, by = 0;
if (out_of_bounds)
{
*pixel = 205.0f;
return TRUE;
}
// bounded x/y
bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x;
by = y < 0 ? 0 : y >= height ? height - 1 : (vsi_size_t)y;
*pixel = base[by * width + bx];
return TRUE;
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
float* buffer[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
int32_t type = 0;
float matrix[6] = {0};
vsi_size_t i = 0;
vsi_size_t b = 0;
vsi_size_t x = 0;
vsi_size_t y = 0;
vsi_size_t out_elements = 0;
vsi_size_t width = 0;
vsi_size_t height = 0;
vsi_size_t outer_size = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
&type);
CHECK_STATUS_FAIL_GOTO(status, final );
for (i = 0; i < 6; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&matrix[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
width = attr[1]->shape->data[0];
height = attr[1]->shape->data[1];
for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i)
{
outer_size *= attr[1]->shape->data[i];
}
// Do something
for (b = 0; b < outer_size; b++)
{
float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
float *dst_base = buffer[1] + b * width * height;
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
float xf = 0;
float yf = 0;
float dst = 0;
_transform_affine(x, y, matrix, &xf, &yf);
if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
{
_read_pixel(src_base, attr[0], xf, yf, &dst);
dst_base[y * width + x] = dst;
}
else
{
float tl = 0, tr = 0, bl = 0, br = 0;
float ar = xf - floorf(xf);
float ab = yf - floorf(yf);
float al = 1.0f - ar;
float at = 1.0f - ab;
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _custom_warp_affine_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create(
graph, I32, &type );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( custom_warp_affine, _setup )
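Usage sketch (illustrative, not from the diff): driving the new custom op through the node API. The node-creation call and the identity matrix are placeholders; the matrix/type fields come from vsi_nn_custom_warp_affine_param and op_compute later in this diff.
/* Hypothetical usage. */
static const float identity[6] = { 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f };
vsi_nn_node_t* node = vsi_nn_AddNode( graph, VSI_NN_OP_CUSTOM_WARP_AFFINE, 1, 1, NULL );
node->nn_param.custom_warp_affine.matrix = identity;
node->nn_param.custom_warp_affine.type   = VSI_NN_INTERPOLATION_BILINEAR;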

View File

@@ -0,0 +1,300 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_perspective")
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_perspective_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def )
#define SCALAR_INPUT_TYPE (2)
#define SCALAR_MATRIX_OFFSET (3)
static void _transform_perspective
(
vsi_size_t dst_x,
vsi_size_t dst_y,
const float m[],
float *src_x,
float *src_y
)
{
float z = dst_x * m[2] + dst_y * m[5] + m[8];
*src_x = (dst_x * m[0] + dst_y * m[3] + m[6]) / z;
*src_y = (dst_x * m[1] + dst_y * m[4] + m[7]) / z;
}
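/* Note: the nine matrix scalars are laid out column-major, i.e.
 *   z     =  m[2]*dst_x + m[5]*dst_y + m[8]
 *   src_x = (m[0]*dst_x + m[3]*dst_y + m[6]) / z
 *   src_y = (m[1]*dst_x + m[4]*dst_y + m[7]) / z   (see _transform_perspective above). */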
static vsi_bool _read_pixel
(
float *base,
vsi_nn_kernel_tensor_attr_t *attr,
float x,
float y,
float *pixel
)
{
vsi_size_t width = attr->shape->data[0];
vsi_size_t height = attr->shape->data[1];
vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height);
vsi_size_t bx = 0, by = 0;
if (out_of_bounds)
{
*pixel = 205.0f;
return TRUE;
}
// bounded x/y
bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x;
by = y < 0 ? 0 : y >= height ? height - 1 : (vsi_size_t)y;
*pixel = base[by * width + bx];
return TRUE;
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
float* buffer[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
int32_t type = 0;
float matrix[9] = {0};
vsi_size_t i = 0;
vsi_size_t b = 0;
vsi_size_t x = 0;
vsi_size_t y = 0;
vsi_size_t out_elements = 0;
vsi_size_t width = 0;
vsi_size_t height = 0;
vsi_size_t outer_size = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
&type);
CHECK_STATUS_FAIL_GOTO(status, final );
for (i = 0; i < 9; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&matrix[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
width = attr[1]->shape->data[0];
height = attr[1]->shape->data[1];
for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i)
{
outer_size *= attr[1]->shape->data[i];
}
// Do something
for (b = 0; b < outer_size; b++)
{
float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
float *dst_base = buffer[1] + b * width * height;
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
float xf = 0;
float yf = 0;
float dst = 0;
_transform_perspective(x, y, matrix, &xf, &yf);
if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
{
_read_pixel(src_base, attr[0], xf, yf, &dst);
dst_base[y * width + x] = dst;
}
else
{
float tl = 0, tr = 0, bl = 0, br = 0;
float ar = xf - floorf(xf);
float ab = yf - floorf(yf);
float al = 1.0f - ar;
float at = 1.0f - ab;
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _custom_warp_perspective_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create(
graph, I32, &type );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( custom_warp_perspective, _setup )

View File

@@ -0,0 +1,295 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum _custom_warp_affine_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}custom_warp_affine_type_e;
#define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine"
// Add kernel hashtable here
#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20))
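// Note: the key packs the input dtype in bits 0..7, the output dtype in bits 8..15,
// the interpolation type in bits 16..19 and the 2D-image flag in bit 20.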
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _custom_warp_affine_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_KERNEL_MAP( U8, U8, bilinear ),
PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
};
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def )
#define SCALAR_MATRIX_OFFSET (2)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * out_shape = NULL;
float m[6] = {0};
float matrix0[4] = {0};
float matrix1[4] = {0};
float matrix4[4] = {0};
int32_t i = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
for (i = 0; i < 6; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&m[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3];
matrix1[0] = m[4]; matrix1[1] = m[5];
matrix4[0] = m[0]; matrix4[1] = m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2;
out_shape = attr[1]->shape;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
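/* Note: each work-item covers 8 output pixels along x (global_scale[0]); e.g. a 640x480
 * plane gives global_size[0] = gpu_align_p2((640 + 7) / 8, 4) = 80 and global_size[1] = 480. */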
status = vsi_nn_kernel_gpu_add_param( node,
"matrix0", &matrix0 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix1", &matrix1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix4", &matrix4 );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_warp_affine_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t type
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _custom_warp_affine_kernel_map;
size_t kernel_map_size = _cnt_of_array( _custom_warp_affine_kernel_map );
vx_param_description_t * param_def = _custom_warp_affine_kernel_param_def;
vx_kernel_initialize_f initializer = _custom_warp_affine_initializer;
int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, type );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM,
inputs, input_num, outputs, output_num );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
// Set default border mode.
border.constant_value.U32 = 0xcdcdcdcd;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_warp_affine, _setup )

View File

@@ -0,0 +1,300 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum _custom_warp_perspective_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}custom_warp_perspective_type_e;
#define _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE "custom_warp_perspective"
// Add kernel hashtable here
#define CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
_CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE }
#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
_CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _custom_warp_perspective_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_KERNEL_MAP( U8, U8, bilinear ),
PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
};
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_perspective_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def )
#define SCALAR_MATRIX_OFFSET (2)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * out_shape = NULL;
float m[9] = {0};
float matrix0[4] = {0};
float matrix1[4] = {0};
float matrix2[4] = {0};
float matrix4[4] = {0};
int32_t i = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
for (i = 0; i < 9; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&m[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[3]; matrix0[3] = m[4];
matrix1[0] = m[6]; matrix1[1] = m[7]; matrix1[2] = m[2]; matrix1[3] = m[5];
matrix2[0] = m[8];
matrix4[0] = m[0]; matrix4[1] = m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2;
out_shape = attr[1]->shape;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_add_param( node,
"matrix0", &matrix0 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix1", &matrix1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix2", &matrix2 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix4", &matrix4 );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_warp_perspective_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t type
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _custom_warp_perspective_kernel_map;
size_t kernel_map_size = _cnt_of_array( _custom_warp_perspective_kernel_map );
vx_param_description_t * param_def = _custom_warp_perspective_kernel_param_def;
vx_kernel_initialize_f initializer = _custom_warp_perspective_initializer;
int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_WARP_PERSPECTIVE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, type );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM,
inputs, input_num, outputs, output_num );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
// Set default border mode.
border.constant_value.U32 = 0xcdcdcdcd;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_warp_perspective, _setup )

View File

@@ -0,0 +1,136 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
typedef struct _ainr_denoise_postprocess_local_data_t {
int32_t placeholder;
} ainr_denoise_postprocess_local_data_t;
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
#if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT
self->n = vxDenoisePostProcesslayer(
self->graph->g,
REQUIRED_IO(inputs[0]), // currInput
REQUIRED_IO(inputs[1]), // nnOutput
REQUIRED_IO(inputs[2]), // preOutImg
REQUIRED_IO(inputs[3]), // S0
REQUIRED_IO(inputs[4]), // C0
REQUIRED_IO(inputs[5]), // C1
REQUIRED_IO(inputs[6]), // C2
REQUIRED_IO(inputs[7]), // C3
REQUIRED_IO(inputs[8]), // clampMin
REQUIRED_IO(inputs[9]), // clampMax
REQUIRED_IO(outputs[0]) // output
);
#else
self->n = NULL;
#endif
if(NULL == self->n)
{
VSILOGE( "Create vxDenoisePostProcesslayer fail." );
return VSI_FAILURE;
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
return VSI_SUCCESS;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_AINR_DENOISE_POSTPROCESS,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ 10,
/* output_num */ 1
);
__END_DECLS
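/* Note: the compute path above is compiled in only when the driver exposes
 * vxDenoisePostProcesslayer. A hedged sketch of opting in -- the macro name comes from the
 * guard in op_compute, the build mechanism is an assumption (any way of defining it works):
 *   CFLAGS += -DVX_DENOISE_POSTPROCESS_SUPPORT=1
 * Without the macro, op_compute leaves self->n == NULL and graph setup fails with
 * "Create vxDenoisePostProcesslayer fail.". */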

View File

@ -0,0 +1,136 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
typedef struct _custom_warp_affine_local_data_t {
int32_t placeholder;
} custom_warp_affine_local_data_t;
/*
Declare the number of inputs and outputs.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_warp_affine_param * p;
p = &(self->nn_param.custom_warp_affine);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 );
vsi_nn_kernel_param_add_int32( param, "type", p->type);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_warp_affine",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
uint32_t i = 0;
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = self->nn_param.custom_warp_affine.size[0];
outputs[0]->attr.size[1] = self->nn_param.custom_warp_affine.size[1];
for (i = 2; i < outputs[0]->attr.dim_num; i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_WARP_AFFINE,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS
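/* Illustration only: a minimal graph-level usage sketch, assuming the generated enum name
 * VSI_NN_OP_CUSTOM_WARP_AFFINE and the standard ovxlib vsi_nn_AddNode() helper (tensor wiring
 * omitted). The matrix/type/size fields are the ones read by op_compute/op_setup above. */
static void example_add_custom_warp_affine( vsi_nn_graph_t * graph )
{
    vsi_nn_node_t * node = vsi_nn_AddNode( graph, VSI_NN_OP_CUSTOM_WARP_AFFINE, 1, 1, NULL );
    if ( node )
    {
        float m[6] = { 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f }; /* identity 2x3 affine matrix */
        int32_t i = 0;
        for ( i = 0; i < 6; i++ )
        {
            node->nn_param.custom_warp_affine.matrix[i] = m[i];
        }
        node->nn_param.custom_warp_affine.type    = 0;   /* interpolation type consumed by the kernel */
        node->nn_param.custom_warp_affine.size[0] = 224; /* output width  */
        node->nn_param.custom_warp_affine.size[1] = 224; /* output height */
    }
}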

View File

@ -0,0 +1,136 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
typedef struct _custom_warp_perspective_local_data_t {
int32_t placeholder;
} custom_warp_perspective_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_warp_perspective_param * p;
p = &(self->nn_param.custom_warp_perspective);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 9 );
vsi_nn_kernel_param_add_int32( param, "type", p->type);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_warp_perspective",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
uint32_t i = 0;
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = self->nn_param.custom_warp_perspective.size[0];
outputs[0]->attr.size[1] = self->nn_param.custom_warp_perspective.size[1];
for (i = 2; i < outputs[0]->attr.dim_num; i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_WARP_PERSPECTIVE,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -64,14 +64,16 @@ typedef struct
static const _kernel_map_type _clip_kernel_map[] =
{
PACK_KERNEL_MAP(F32, F32),
PACK_KERNEL_MAP(F32, U8),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(U8, F32),
PACK_KERNEL_MAP_2D(F32, F32),
PACK_KERNEL_MAP_2D(F32, U8),
PACK_KERNEL_MAP_2D(U8, U8),
PACK_KERNEL_MAP_2D(U8, F32),
PACK_KERNEL_MAP(F32, F32),
PACK_KERNEL_MAP(F32, U8),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(U8, F32),
PACK_KERNEL_MAP(BF16, BF16),
PACK_KERNEL_MAP_2D(F32, F32),
PACK_KERNEL_MAP_2D(F32, U8),
PACK_KERNEL_MAP_2D(U8, U8),
PACK_KERNEL_MAP_2D(U8, F32),
PACK_KERNEL_MAP_2D(BF16, BF16),
};

View File

@ -0,0 +1,226 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _DEPTH2SPACE_CRD_KERNEL_SOURCE "depth2space_crd"
// Add kernel hashtable here
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F32TOF32 CVIVANTE_NAMESPACE("cl.depth2space_crd_F32toF32")
// Add kernel hashtable here
#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \
((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8))
#define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} depth2space_crd_map[] =
{
TENSOR_DEPTH2SPACE_CRD_KERNELS(F32, F32, _DEPTH2SPACE_CRD_KERNEL_SOURCE)
};
/*
* Kernel params
*/
static vx_param_description_t _depth2space_crd_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t output_dims = 0;
int32_t output_width = 0;
int32_t output_height = 0;
int32_t output_chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
output_dims = (int32_t)attr[0]->shape->size;
output_width = (int32_t)(attr[0]->shape->data[0]);
output_height = (int32_t)(attr[0]->shape->data[1]);
output_chn = (int32_t)(output_dims > 2 ? attr[0]->shape->data[2] : 1);
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = output_width;
gpu_param.global_size[1] = output_height;
gpu_param.global_size[2] = output_chn;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _depth2space_crd_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 );
for ( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ )
{
if ( depth2space_crd_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(depth2space_crd_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name );
kernel->info.parameters = _depth2space_crd_kernel_param_def;
kernel->info.numParams = _DEPTH2SPACE_CRD_PARAM_NUM;
kernel->info.initialize = _depth2space_crd_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
depth2space_crd_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
depth2space_crd_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_DEPTH2SPACE_CRD_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( node_params, _DEPTH2SPACE_CRD_PARAM_NUM,
inputs, 1, outputs, 1 );
node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _DEPTH2SPACE_CRD_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[2] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( depth2space_internal, _setup )
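/* Illustration only: a hedged scalar reference of the CRD ("column-row-depth") depth-to-space
 * mapping this kernel targets. The CL shader itself is not shown here, so the index math below
 * follows the standard CRD definition rather than the shipped source. For a W x H x C input and
 * block size b, the output is (W*b) x (H*b) x (C/(b*b)). */
static void depth2space_crd_ref( const float * in, float * out, int W, int H, int C, int b )
{
    int ox, oy, oc;
    int OW = W * b, OH = H * b, OC = C / (b * b);
    for ( oc = 0; oc < OC; oc++ )
    for ( oy = 0; oy < OH; oy++ )
    for ( ox = 0; ox < OW; ox++ )
    {
        int ix = ox / b, iy = oy / b;
        int ic = oc * b * b + (oy % b) * b + (ox % b); /* CRD: the reduced channel varies slowest */
        out[(oc * OH + oy) * OW + ox] = in[(ic * H + iy) * W + ix];
    }
}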

View File

@ -42,6 +42,7 @@ __BEGIN_DECLS
typedef enum
{
UNARY_SIN,
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
@ -89,6 +90,7 @@ typedef enum
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
#define SIN_OPERATION sin
#define COS_OPERATION cos
#define EXP_OPERATION exp
#define LOG_OPERATION log
#define ELU_OPERATION elu
@ -107,6 +109,8 @@ static const struct {
{
TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F32, F32)
@ -128,6 +132,8 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
@ -148,6 +154,7 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8)
@ -159,6 +166,7 @@ static const struct {
TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8)
@ -175,6 +183,7 @@ static const struct {
};
#undef SIN_OPERATION
#undef COS_OPERATION
#undef EXP_OPERATION
#undef LOG_OPERATION
#undef ELU_OPERATION
@ -438,6 +447,7 @@ OnError:
REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU )

View File

@ -103,7 +103,6 @@ static vx_param_description_t _floordiv_kernel_param_def[] =
#define SCALAR_OUTPUT_SCALE (7)
#define SCALAR_OUTPUT_TAIL (8)
#define FLOORDIV_PARAM_NUM 3
#define FLOORDIV_QUANT_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def )
/*
@ -154,8 +153,6 @@ final:
return status;
} /* _floordiv_initializer() */
/*
* Query kernel
*/
@ -164,8 +161,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d,
vsi_bool *is_use_u8_kernel
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
@ -189,7 +185,7 @@ static vsi_status _query_kernel
{
in0_dtype = F32;
}
else if (I16 == in0_dtype)
else if (I16 == in0_dtype || I8 == in0_dtype)
{
in0_dtype = I32;
}
@ -198,7 +194,7 @@ static vsi_status _query_kernel
{
in1_dtype = F32;
}
else if (I16 == in1_dtype)
else if (I16 == in1_dtype || I8 == in1_dtype)
{
in1_dtype = I32;
}
@ -207,16 +203,9 @@ static vsi_status _query_kernel
{
out_dtype = F32;
}
if ((U8 == in0_dtype) || (U8 == in1_dtype) || (U8 == out_dtype))
else if (I16 == out_dtype || I8 == out_dtype)
{
param_def_size = FLOORDIV_QUANT_PARAM_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = FLOORDIV_PARAM_NUM;
*is_use_u8_kernel = FALSE;
out_dtype = I32;
}
key = FLOORDIV_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d);
@ -228,7 +217,7 @@ static vsi_status _query_kernel
break;
}
}
if( i < kernel_map_size )
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@ -262,19 +251,18 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
vsi_bool is_use_u8_kernel = FALSE;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
outputScale = 1.0f / outputScale;
input0Tail = -(input0Tail * input0Scale);
input1Tail = -(input1Tail * input1Scale);
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -282,40 +270,35 @@ static vsi_nn_kernel_node_t _setup
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel);
if( VSI_SUCCESS == status)
status = _query_kernel( kernel, inputs, outputs, image_2d);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
size_t node_params_num = FLOORDIV_PARAM_NUM;
size_t node_params_num = FLOORDIV_QUANT_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM,
inputs, input_num, outputs, output_num );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
node_params_num = FLOORDIV_QUANT_PARAM_NUM;
}
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
return node;
} /* _setup() */
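/* Illustration only: a hedged sketch of the per-element math the scalars above encode
 * (what the CL kernel is expected to evaluate; reference, not the shipped shader).
 * input0Tail = -(zp0 * scale0), outputScale = 1/scale_out, outputTail = zp_out -- see _setup. */
static float floordiv_ref( float q0, float q1,
        float in0_scale, float in0_tail,
        float in1_scale, float in1_tail,
        float out_scale, float out_tail )
{
    float x0 = q0 * in0_scale + in0_tail; /* dequantize input 0 */
    float x1 = q1 * in1_scale + in1_tail; /* dequantize input 1 */
    return floorf( x0 / x1 ) * out_scale + out_tail; /* floor-divide, then requantize (needs <math.h>) */
}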

View File

@ -47,7 +47,8 @@ typedef enum
INTERNAL_KERNEL_GATHER,
} _internal_kernel_e;
#define _GATHER_KERNEL_SOURCE "gather"
#define _GATHER_KERNEL_SOURCE "gather"
#define _GATHER_BATCH_KERNEL_SOURCE "gather_batch"
// Add kernel hashtable here
#define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_U8toU8")
@ -55,25 +56,39 @@ typedef enum
#define VX_KERNEL_NAME_GATHER_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_I32toI32")
#define VX_KERNEL_NAME_GATHER_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_F32toF32")
#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_batch_U8toU8")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("cl.gather_batch_F16toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_batch_I32toI32")
#define VX_KERNEL_NAME_GATHER_BATCH_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_batch_F32toF32")
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d, _batch) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d << 4) | (_batch))
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \
VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \
VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} gather_map[] =
{
TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(F16, I32, F16, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(I32, I32, I32, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(F32, I32, F32, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(I32, I32, I32, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(F32, I32, F32, _GATHER_BATCH_KERNEL_SOURCE)
};
/*
@ -88,6 +103,7 @@ static vx_param_description_t _gather_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
@ -97,6 +113,7 @@ static vsi_status cal_gather_tensor_reshape_size
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
vsi_size_t batch_dims,
uint32_t idxFlg
)
{
@ -105,30 +122,37 @@ static vsi_status cal_gather_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
for(i = 0; i < dims_num; ++i)
for (i = 0; i < dims_num - batch_dims; ++i)
{
elementCnt *= input_size[i];
}
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
for (; i < dims_num; ++i)
{
outerCnt *= input_size[i];
}
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = elementCnt;
sizes[1] = 1;
sizes[1] = outerCnt;
status = VSI_SUCCESS;
}
else
{
if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
sizes[2] = outerCnt;
status = VSI_SUCCESS;
}
}
@ -160,9 +184,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
vsi_size_array_t * input1_shape = NULL;
int32_t block_size = 0;
int32_t block_num = 0;
vsi_ssize_t indices_num = 1;
size_t input_dims1 = 0;
size_t i = 0;
vsi_ssize_t indices_num = 1;
size_t input_dims1 = 0;
size_t i = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@ -176,7 +200,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
input1_shape = attr[1]->shape;
input_dims1 = input1_shape->size;
for (i = 0; i < input_dims1; i++)
for (i = 0; i < input_dims1 - 1; i++)
{
indices_num *= input1_shape->data[i];
}
@ -214,7 +238,8 @@ static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
vsi_nn_tensor_t * const * const outputs,
int32_t is_batch
/* Add extra params */
)
{
@ -227,17 +252,17 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch );
for( i = 0; i < _cnt_of_array(gather_map); i ++ )
for ( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
if( gather_map[i].key == key )
if ( gather_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(gather_map) )
if ( i < _cnt_of_array(gather_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name );
kernel->info.parameters = _gather_kernel_param_def;
@ -271,54 +296,69 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_GATHER_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" );
int32_t is_batch = batch_dims > 0 ? 1 : 0;
vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
int32_t i = 0;
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
if(status != VSI_SUCCESS)
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0);
if (status != VSI_SUCCESS)
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], 2 );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if( VSI_SUCCESS == status)
status = _query_kernel( kernel, inputs, outputs, is_batch );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 0;
#define RESHAPE_DIM 2
uint32_t index = 3;
int32_t batch = (int32_t)shapes[1][1];
/* Pass parameters to node. */
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM );
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM );
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM );
#undef RESHAPE_DIM
vsi_nn_kernel_node_pack_io( node_params, _GATHER_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &indices_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &node_params[0] );
vsi_nn_kernel_tensor_release( &node_params[1] );
vsi_nn_kernel_tensor_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
}
}
for (i = 0; i < 3; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
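/* A hedged worked example of the batch_dims handling above, in whcn ordering:
 * params [6, 4, 3, 2], indices [5, 2], batch_dims = 1, block_size = 6, axis_num = 4, block_num = 3.
 *   params : elementCnt = 6*4*3 = 72, outerCnt = 2  -> reshaped to {6, 12, 2}
 *   indices: elementCnt = 5,          outerCnt = 2  -> reshaped to {5, 2}
 *   output : [6, 5, 3, 2]                           -> reshaped to {6, 15, 2}
 * The initializer then counts indices_num = 5 (all but the last reshaped indices dim) and the
 * extra "batch" scalar passed to the kernel is shapes[1][1] = 2. */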

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -445,45 +444,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static int32_t _optimize_gn_shape_cl
(
vsi_nn_tensor_t ** inputs,
vsi_size_t group_size,
int32_t group_num,
vsi_size_t* opt_shape,
int32_t* is2D_flg
)
{
vsi_status status = VSI_SUCCESS;
vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
vsi_size_t new_rank = 0;
group_shape[0] = inputs[0]->attr.size[0];
group_shape[1] = inputs[0]->attr.size[1];
group_shape[2] = group_size;
vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank );
if (opt_shape[1] == 1)
{
opt_shape[1] = group_num;
opt_shape[2] = 1;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
is2D_flg[0] = 1;
}
else if (new_rank == 2)
{
opt_shape[2] = group_num;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
}
else
{
status = VSI_FAILURE;
}
return status;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
@ -535,11 +495,13 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg);
status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size,
inputs[0]->attr.dim_num, group_num, 0, new_shape);
if ( VSI_SUCCESS != status )
{
goto final;
}
is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num);
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);

View File

@ -406,12 +406,12 @@ static vsi_nn_kernel_node_t _setup
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
uint32_t hashkey = 0;
int32_t i = 0;
uint32_t rank = outputs[0]->attr.dim_num;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
size_t width = inputs[0]->attr.size[0];
size_t height = inputs[0]->attr.size[1];
int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH
&& rank > 2;
int32_t group_num = (int32_t)(width + 15) / 16;
int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);

View File

@ -101,18 +101,23 @@ static const _kernel_map_type moments_map[] =
TENSOR_MOMENTS_KERNELS(U8, F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(F32, F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(I32, F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(BF16,F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(U8, F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(F32, F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(I32, F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(BF16,F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(U8, F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(F32, F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(I32, F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(BF16,F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,F32, 0, 1, 2, KERNEL_SOURCE_5)
};
/*

View File

@ -0,0 +1,301 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _topk_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, 0 ),
PACK_KERNEL_MAP( F32, F32, 1 ),
PACK_KERNEL_MAP( F32, F32, 2 ),
PACK_KERNEL_MAP( F32, F32, 3 ),
PACK_KERNEL_MAP( F32, F32, 4 ),
PACK_KERNEL_MAP( F32, F32, 5 ),
PACK_KERNEL_MAP( F32, F32, 6 ),
PACK_KERNEL_MAP( U32, U32, 0 ),
PACK_KERNEL_MAP( U32, U32, 1 ),
PACK_KERNEL_MAP( U32, U32, 2 ),
PACK_KERNEL_MAP( U32, U32, 3 ),
PACK_KERNEL_MAP( U32, U32, 4 ),
PACK_KERNEL_MAP( U32, U32, 5 ),
PACK_KERNEL_MAP( U32, U32, 6 ),
PACK_KERNEL_MAP( I32, I32, 0 ),
PACK_KERNEL_MAP( I32, I32, 1 ),
PACK_KERNEL_MAP( I32, I32, 2 ),
PACK_KERNEL_MAP( I32, I32, 3 ),
PACK_KERNEL_MAP( I32, I32, 4 ),
PACK_KERNEL_MAP( I32, I32, 5 ),
PACK_KERNEL_MAP( I32, I32, 6 ),
};
/*
* Kernel params
*/
static vx_param_description_t _topk_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def )
#define SCALAR_INPUT_NUM_STAGES (3)
#define SCALAR_INPUT_WIDTH (4)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_topk_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
int32_t num_stages = 0;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_NUM_STAGES], &num_stages);
CHECK_STATUS_FAIL_GOTO(status, final );
in_shape = input_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.local_size[0] = (size_t)(1 << num_stages);
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] = (size_t)(1 << num_stages);
gpu_param.global_size[1] = in_shape->data[1];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
return status;
} /* _topk_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t num_stages
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _topk_kernel_map;
size_t kernel_map_size = _cnt_of_array( _topk_kernel_map );
vx_param_description_t * param_def = _topk_kernel_param_def;
vx_kernel_initialize_f initializer = _topk_initializer;
#define _PACK_SELECT_KEY( in_type, out_type ) \
( (in_type) | (out_type << 8) )
uint32_t key = 0;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_HASH_KEY( F32, F32, num_stages );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_HASH_KEY( U32, U32, num_stages );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_HASH_KEY( I32, I32, num_stages );
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t block_size = inputs[0]->attr.size[0];
vsi_size_t block_num = 1;
uint32_t i = 0;
vsi_nn_tensor_t* rs_tensors[3] = { NULL };
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t width = (int32_t)block_size;
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
for (i = 1; i < inputs[0]->attr.dim_num; i ++)
{
block_num = block_num * inputs[0]->attr.size[i];
}
if( vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE ||
outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 )
{
return NULL;
}
shape[0][0] = block_size;
shape[0][1] = block_num;
shape[1][0] = top_k;
shape[1][1] = block_num;
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
status = _query_kernel( kernel, inputs, outputs, num_stages );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM,
rs_tensors, input_num, &rs_tensors[1], output_num );
/* Pass parameters to node. */
node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
graph, I32, &num_stages );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
final:
vsi_safe_release_tensor(rs_tensors[0]);
vsi_safe_release_tensor(rs_tensors[1]);
vsi_safe_release_tensor(rs_tensors[2]);
if (node_params[SCALAR_INPUT_NUM_STAGES])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
}
if (node_params[SCALAR_INPUT_WIDTH])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( topk, _setup )
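/* A hedged worked example of the stage selection in _setup above:
 *   block_size = 100 elements along the sorted axis
 *   num_stages = ceil(log2(100 / 2.0)) = ceil(5.64) = 6
 *   local_size[0] = global_size[0] = 1 << 6 = 64 work-items cooperate on each row
 *   hash key resolves to "cl.topk_stage6_F32toF32_I32" for F32/F16 data (name built by PACK_KERNEL_MAP)
 * Only stages 0..6 are registered, so rows longer than 128 elements would not match a kernel
 * here and _query_kernel would fail. */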

View File

@ -40,6 +40,7 @@ __BEGIN_DECLS
typedef enum
{
UNARY_SIN,
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
@ -69,6 +70,11 @@ static float sin_eval(float data)
return sinf(data);
}
static float cos_eval(float data)
{
return cosf(data);
}
static float log_eval(float data)
{
return logf(data);
@ -212,6 +218,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_SIN:
data = sin_eval(data);
break;
case UNARY_COS:
data = cos_eval(data);
break;
case UNARY_EXP:
data = exp_eval(data);
break;
@ -372,6 +381,7 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )

View File

@ -42,7 +42,7 @@ __BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (3)
#define _CPU_ARG_NUM (4)
#define _CPU_INPUT_NUM (2)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
@ -62,9 +62,9 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
uint32_t* buffer_idx = NULL;
size_t in_elements = 0, out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
vsi_size_t i = 0, j = 0;
int32_t block_size = 1, block_num = 1, axis_num = 0;
vsi_size_t indices_num = 1;
vsi_size_t i = 0, j = 0, b = 0;
int32_t block_size = 1, block_num = 1, axis_num = 0, batch_dims = 0;
vsi_size_t indices_num = 1, batch = 1, in_stride = 1, out_stride = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
@ -86,6 +86,8 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis_num);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &batch_dims);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
@ -98,26 +100,44 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
memset( buffer[1], 0, out_elements * sizeof(float) );
{
for(i = 0; i < attr[1]->shape->size; ++i)
for (i = 0; i < attr[1]->shape->size - (vsi_size_t)batch_dims; i++)
{
indices_num *= attr[1]->shape->data[i];
}
for(i = 0; i < (vsi_size_t)block_num; i++)
for (; i < attr[1]->shape->size; i++)
{
for(j = 0; j < indices_num; j++)
batch *= attr[1]->shape->data[i];
}
for (i = 0; i < attr[0]->shape->size - (vsi_size_t)batch_dims; i++)
{
in_stride *= attr[0]->shape->data[i];
}
for (i = 0; i < attr[2]->shape->size - (vsi_size_t)batch_dims; i++)
{
out_stride *= attr[2]->shape->data[i];
}
for (b = 0; b < batch; b++)
{
for (i = 0; i < (vsi_size_t)block_num; i++)
{
uint32_t indice = buffer_idx[j];
vsi_size_t in_index = (i * axis_num + indice) * block_size;
if(in_index < in_elements)
for (j = 0; j < indices_num; j++)
{
vsi_size_t out_index = (i * indices_num + j) * block_size;
memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float));
}
else
{
status = VX_FAILURE;
CHECK_STATUS_FAIL_GOTO( status, final );
uint32_t indice = buffer_idx[j + indices_num * b];
vsi_size_t in_index = (i * axis_num + indice) * block_size + b * in_stride;
if (in_index < in_elements)
{
vsi_size_t out_index = (i * indices_num + j) * block_size + b * out_stride;
memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float));
}
else
{
status = VX_FAILURE;
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
}
}
@ -128,20 +148,20 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
CHECK_STATUS_FAIL_GOTO( status, final );
final:
if( buffer_idx )
if ( buffer_idx )
{
free( buffer_idx );
}
for( i = 0; i < 2; i ++ )
for ( i = 0; i < 2; i ++ )
{
if( buffer[i] )
if ( buffer[i] )
{
free( buffer[i] );
}
}
for( i = 0; i < _CPU_IO_NUM; i ++ )
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _gather_exec() */
@ -156,6 +176,7 @@ static vx_param_description_t _gather_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
@ -201,15 +222,16 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 3;
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
@ -218,12 +240,14 @@ static vsi_nn_kernel_node_t _setup
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch_dims );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[3] );
vsi_nn_kernel_scalar_release( &backend_params[4] );
vsi_nn_kernel_scalar_release( &backend_params[5] );
vsi_nn_kernel_scalar_release( &backend_params[6] );
}
else
{

View File

@ -103,9 +103,10 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec)
if(coord_stride <= 4) // reshape 3D
{
vsi_ssize_t stride[4] = {block_size, 0, 0, 0};
int32_t start_dim = (int32_t)attr[0]->shape->size - coord_stride;
for(i = 1; i < coord_stride; ++i)
{
stride[i] = stride[i - 1] * attr[0]->shape->data[i];
stride[i] = stride[i - 1] * attr[0]->shape->data[start_dim + i - 1];
}
for(i = 0; i < indices_num; i++)
@ -118,8 +119,8 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec)
for(j = 0; j < coord_stride; j++)
{
coord[j] = buffer_idx[i * coord_stride + j];
in_index += coord[j] * stride[j];
}
in_index = coord[3] * stride[3] + coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0];
memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float));
}
}

View File

@ -61,7 +61,13 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec)
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
vsi_size_t batch = 1;
vsi_size_t depth = 1;
vsi_size_t norm_size = 1;
vsi_size_t b = 0;
vsi_size_t c = 0;
vsi_size_t i = 0;
size_t rank = 1;
float eps = .0f;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
@ -96,62 +102,55 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec)
CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final );
memset( buffer[3], 0, out_elements * sizeof(float) );
rank = attr[0]->shape->size;
batch = attr[0]->shape->data[rank - 1];
depth = attr[0]->shape->data[rank - 2];
for ( i = 0; i < (vsi_size_t)rank - 2; i++)
{
vsi_size_t b = 0, c = 0, h = 0, w = 0;
vsi_size_t height = attr[0]->shape->data[1];
vsi_size_t width = attr[0]->shape->data[0];
vsi_size_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
vsi_size_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
norm_size *= attr[0]->shape->data[i];
}
for (b = 0; b < bh; b++)
for (b = 0; b < batch; b++)
{
for (c = 0; c < depth; c++)
{
for (c = 0; c < ch; c++)
vsi_size_t page = c * norm_size + b * norm_size * depth;
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
float data = 0;
float scaleVal = buffer[2][c];
float biasVal = buffer[1][c];
for (i = 0; i < norm_size; i++)
{
vsi_size_t page = c * (height * width) + b * (height * width * ch);
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
float data = 0;
float scaleVal = buffer[2][c];
float biasVal = buffer[1][c];
vsi_size_t index = page + i;
sum += buffer[0][index];
}
for (h = 0; h < height; h++)
{
vsi_size_t len = page + h * width;
mean = sum / (float)norm_size;
for (w = 0; w < width; w++)
{
vsi_size_t index = len + w;
sum += buffer[0][index];
}
}
mean = sum / (width * height);
for (h = 0; h < height; h++)
{
vsi_size_t len = page + h * width;
for (w = 0; w < width; w++)
{
vsi_size_t index = len + w;
data = buffer[0][index] - mean;
sumsq += data * data;
}
}
vari = sumsq / (width * height);
vari = (float)(1.0 / sqrtf(vari + eps));
for (h = 0; h < height; h++)
{
vsi_size_t len = page + h * width;
for (w = 0; w < width; w++)
{
float normVal = 0;
vsi_size_t index = len + w;
data = buffer[0][index] - mean;
for (i = 0; i < norm_size; i++)
{
vsi_size_t index = page + i;
data = buffer[0][index] - mean;
sumsq += data * data;
}
normVal = data * vari * scaleVal + biasVal;
buffer[3][index] = normVal;
}
}
vari = sumsq / (float)norm_size;
vari = (float)(1.0 / sqrtf(vari + eps));
for (i = 0; i < norm_size; i++)
{
float normVal = 0;
vsi_size_t index = page + i;
data = buffer[0][index] - mean;
normVal = data * vari * scaleVal + biasVal;
buffer[3][index] = normVal;
}
}
}
@ -256,4 +255,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( instance_norm, _setup )
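/* For clarity, a hedged trace of the rewritten reference loop above on a whcn tensor of shape [8, 8, 3, 2]:
 *   rank = 4, batch = 2 (last dim), depth = 3 (second-to-last dim), norm_size = 8*8 = 64
 *   each (b, c) pair normalizes the 64 values starting at page = c*64 + b*64*3 with the
 *   mean/variance of that slice and per-channel scale buffer[2][c] / bias buffer[1][c].
 * Unlike the removed version, the loop no longer hard-codes a 4-D W/H/C/N layout; it only
 * assumes the last two dimensions are channel and batch. */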

View File

@ -104,7 +104,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -311,4 +310,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( resize_bilinear, _setup )

View File

@ -63,6 +63,11 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_F16to"#OUT_DTYPE"_2D"), \
HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) },
#define HASH_ARGMAX_KERNELS_MIX_OPT( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 2), \
CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_opt"), \
HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) },
static const struct {
uint32_t key;
char* function_name;
@ -132,6 +137,8 @@ static const struct {
HASH_ARGMAX_KERNELS_2D(2, U8, I16)
HASH_ARGMAX_KERNELS_2D(2, I16, U8)
HASH_ARGMAX_KERNELS_2D(2, I16, I16)
HASH_ARGMAX_KERNELS_MIX_OPT(2, U8, I16)
HASH_ARGMAX_KERNELS_MIX_OPT(2, I8, I16)
};
static vx_param_description_t kernel_param_def[] =
@ -228,7 +235,18 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
if (attr[0]->dtype == I8 ||
attr[0]->dtype == U8)
{
if ( attr[1]->dtype == I8 ||
if (axis == 2 &&
input_shape->data[2] > 1 &&
((attr[1]->dtype == I8 || attr[1]->dtype == U8)
|| (attr[1]->dtype == I16 && input_shape->data[2] < 256)))
{
uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16)
| ((argLenSub1 & 0xFF) << 8) | (argLenSub1 & 0xFF);
packedArgIdx[0] = packedArgIdx[1] = pack;
packedArgIdx[2] = packedArgIdx[3] = pack;
gpu_param.global_scale[0] = 16;
}
else if ( attr[1]->dtype == I8 ||
attr[1]->dtype == U8)
{
uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16)
@ -302,7 +320,6 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
}
break;
case 1:
case 2:
{
gpu_dp_inst_t uniExtractData_2x8 = {{
0x11111111, // TCfg
@ -324,6 +341,52 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case 2:
{
gpu_dp_inst_t uniExtractData_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtract1stU8toI16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtract2ndU8toI16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x0b0a0908, 0x0f0e0d0c, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractData_2x8", &uniExtractData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract1stU8toI16_2x8", &uniExtract1stU8toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract2ndU8toI16_2x8", &uniExtract2ndU8toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"argLenSub1", &argLenSub1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"packedArgIdx", packedArgIdx );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
@ -354,6 +417,16 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if ((input_dtype == I8 || input_dtype == U8)
&& output_dtype == I16
&& axis == 2
&& inputs[0]->attr.size[2] < 256
&& image_2d == 0)
{
image_2d = 2;
}
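/* image_2d == 2 is reused here as an extra hash-key state: small-depth U8/I8 -> I16
 * argmax along axis 2 is routed to the "_opt" kernels registered through
 * HASH_ARGMAX_KERNELS_MIX_OPT above, instead of introducing a separate flag.
 */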
key = HASH_ARGMAX_HASH_KEY( axis, input_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(_argmax_evis_kernel_map); i ++ )

View File

@ -85,12 +85,12 @@ typedef enum
#define COMPARISONS_KERNELS_HALF(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \
{ HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \
HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, F16, F16), \
HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, BF16, BF16), \
SOURCE },
#define COMPARISONS_KERNELS_HALF_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \
{ HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \
HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, F16, F16), \
HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, BF16, BF16), \
SOURCE },
#define LESS_OP less
@ -396,6 +396,26 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
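/* bfloat16 is the high half of an IEEE-754 binary32, so the two shuffles above
 * appear to widen eight BF16 lanes to float32 (four per instruction) by placing
 * each 16-bit value into the upper 16 bits of a 32-bit lane. A scalar equivalent,
 * for reference only:
 *   static float bf16_to_f32(uint16_t v)
 *   {
 *       uint32_t bits = ((uint32_t)v) << 16;
 *       float f;
 *       memcpy(&f, &bits, sizeof(f));
 *       return f;
 *   }
 */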
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
@ -403,6 +423,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
"uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"input0Scale", &input0Scale );
status |= vsi_nn_kernel_gpu_add_param( node,
@ -453,7 +477,7 @@ static vsi_status _query_kernel
int i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
output_dtype = output_dtype == I8 ? BOOL8 : output_dtype;
key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d );

View File

@ -301,6 +301,7 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
case _PACK_SELECT_KEY( I8, I8):
case _PACK_SELECT_KEY( I16, I16):
case _PACK_SELECT_KEY( F16, F16):
case _PACK_SELECT_KEY( BF16, BF16):
{
gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
@ -367,6 +368,16 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == BF16)
{
input0_dtype = F16;
}
if (output_dtype == BF16)
{
output_dtype = F16;
}
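/* BF16 is remapped to F16 only for kernel selection: both are 16-bit element
 * types, so the existing F16 depth2space_crd shaders can presumably move the data
 * unchanged and no dedicated BF16 kernel source is required.
 */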
key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg );
for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ )

View File

@ -42,6 +42,7 @@ __BEGIN_DECLS
typedef enum
{
UNARY_SIN,
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
@ -79,6 +80,7 @@ typedef enum
SOURCE },
#define SIN_OPERATION sin
#define COS_OPERATION cos
#define EXP_OPERATION exp
#define LOG_OPERATION log
#define ELU_OPERATION elu
@ -106,6 +108,17 @@ static const struct {
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_3D)
@ -162,6 +175,17 @@ static const struct {
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_2D)
@ -317,6 +341,7 @@ static const struct {
};
#undef SIN_OPERATION
#undef COS_OPERATION
#undef EXP_OPERATION
#undef LOG_OPERATION
#undef ELU_OPERATION
@ -443,6 +468,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
switch( pack_key )
{
case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ELU, BF16, BF16 ):
@ -736,6 +762,7 @@ OnError:
REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU )

View File

@ -64,6 +64,28 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8")
#define VX_KERNEL_NAME_GATHER_BATCH_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8")
#define VX_KERNEL_NAME_GATHER_BATCH_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16")
#define VX_KERNEL_NAME_GATHER_BATCH_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8_axis0")
#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array")
@ -77,31 +99,43 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_1 "gather"
#define KERNEL_SOURCE_2 "gather_mix"
#define KERNEL_SOURCE_3 "gather_array"
#define KERNEL_SOURCE_4 "gather_batch"
#define KERNEL_SOURCE_5 "gather_mix_batch"
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max))
#define HASH_GATHER_KEY(_in0_type, _in1_type, _out_type, _axis0, _max, _batch) \
((_in0_type << 24) | (_in1_type << 16) | (_out_type << 8) | (_axis0 << 6) | (_max << 4) | (_batch))
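/* Key layout after this change (illustration only):
 *   bits 31..24 in0 dtype | 23..16 in1 dtype | 15..8 out dtype |
 *   bit 6 axis0 flag | bit 4 array (_max) flag | bit 0 batch flag
 * e.g. HASH_GATHER_KEY(U8, I32, U8, 0, 0, 1) selects the gather_batch U8->U8 kernel.
 */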
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 0), \
VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 0), \
VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1, 0), \
VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1, 0), \
VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 1), \
VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_BATCH_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 1), \
VX_KERNEL_NAME_GATHER_BATCH_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -136,6 +170,26 @@ static const struct {
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3)
TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(I8, I32, I8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(I16, I32, I16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(I8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(I16, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, I8, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, I16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(U8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, U8, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_5)
};
/*
@ -158,6 +212,7 @@ static vsi_status get_gather_tensor_reshape_size
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
vsi_size_t block_size,
vsi_size_t batch_dims,
uint32_t idxFlg,
int32_t* arrayFlg
)
@ -167,13 +222,19 @@ static vsi_status get_gather_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
for(i = 0; i < dims_num; ++i)
for(i = 0; i < dims_num - batch_dims; ++i)
{
elementCnt *= input_size[i];
}
for(; i < dims_num; ++i)
{
outerCnt *= input_size[i];
}
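/* With batch_dims > 0 the first (dims_num - batch_dims) dimensions are folded into
 * elementCnt and the trailing batch dimensions into outerCnt; e.g. an input of
 * shape {8, 100, 4} with batch_dims == 1 gives elementCnt = 800 and outerCnt = 4.
 * outerCnt then feeds sizes[1] (index path) or sizes[2] (data path) of the
 * reshaped view below.
 */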
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
@ -182,13 +243,14 @@ static vsi_status get_gather_tensor_reshape_size
if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = elementCnt;
sizes[1] = 1;
sizes[1] = outerCnt;
status = VSI_SUCCESS;
}
else
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
sizes[2] = outerCnt;
if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
@ -222,6 +284,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
int32_t block_num = 0;
int32_t indices_num = 1;
uint32_t input_dims1 = 0;
int32_t batch = 1;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL};
vsi_size_array_t * input1_shape = NULL;
@ -283,7 +346,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
for (i = 0; i < input_dims1; i++)
for (i = 0; i < input_dims1 - 1; i++)
{
indices_num *= (int32_t)(input1_shape->data[i]);
}
@ -376,6 +439,11 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num);
if (attr[2]->shape->size > 2)
{
batch = (int32_t)attr[2]->shape->data[2];
status = vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
@ -415,6 +483,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
int32_t block_num = 0;
int32_t indices_num = 1;
int32_t batch = 1;
uint32_t input_dims1 = 0;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL};
@ -475,10 +544,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
for (i = 0; i < input_dims1; i++)
for (i = 0; i < input_dims1 - 1; i++)
{
indices_num *= (int32_t)(input1_shape->data[i]);
}
batch = (int32_t)(input1_shape->data[input_dims1 - 1]);
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
@ -486,7 +556,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = block_num;
shaderParam.global_size[2] = 1;
shaderParam.global_size[2] = batch;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
@ -585,6 +655,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num);
if (attr[2]->shape->size > 2)
{
status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
@ -617,7 +691,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t axis,
int32_t is_array
int32_t is_array,
int32_t is_batch
)
{
vsi_status status = VSI_FAILURE;
@ -638,7 +713,7 @@ static vsi_status _query_kernel
output_dtype = F16;
}
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array);
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array, is_batch);
for( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
@ -688,25 +763,30 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t axis0_flg = 0;
int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;
int32_t is_batch = batch_dims > 0 ? 1 : 0;
vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
int32_t i = 0;
if (axis == 0)
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array);
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], batch_dims, 0, &is_array);
axis0_flg = 1;
}
else
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array);
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array);
axis0_flg = 0;
}
#undef VSI_NN_MAX_BLOCK_SIZE
@ -715,38 +795,45 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], 2 );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array);
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
#define RESHAPE_DIM 2
uint32_t index = 3;
/* Pass parameters to node. */
tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM );
tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM );
tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM );
#undef RESHAPE_DIM
vsi_nn_kernel_node_pack_io( tmp_params, _GATHER_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &tmp_params[0] );
vsi_nn_kernel_tensor_release( &tmp_params[1] );
vsi_nn_kernel_tensor_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
vsi_nn_kernel_scalar_release( &tmp_params[5] );
}
}
for (i = 0; i < 3; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -994,44 +993,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static int32_t _optimize_gn_shape
(
vsi_nn_tensor_t ** inputs,
vsi_size_t group_size,
int32_t group_num,
vsi_size_t* opt_shape,
int32_t* is2D_flg
)
{
vsi_status status = VSI_SUCCESS;
vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
vsi_size_t new_rank = 0;
group_shape[0] = inputs[0]->attr.size[0];
group_shape[1] = inputs[0]->attr.size[1];
group_shape[2] = group_size;
vsi_nn_kernel_optimize_element_shape( group_shape, 3, opt_shape, &new_rank );
if (opt_shape[1] == 1)
{
opt_shape[1] = group_num;
opt_shape[2] = 1;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
is2D_flg[0] = 1;
}
else if (new_rank == 2)
{
opt_shape[2] = group_num;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
}
else
{
status = VSI_FAILURE;
}
return status;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
@ -1077,11 +1038,13 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _optimize_gn_shape(inputs, group_size, group_num, new_shape, &is2D_flg);
status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size,
inputs[0]->attr.dim_num, group_num, 0, new_shape);
if ( VSI_SUCCESS != status )
{
goto final;
}
is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num);
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);

View File

@ -1004,12 +1004,15 @@ static vsi_nn_kernel_node_t _setup
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
uint32_t hashkey = 0;
int32_t i = 0;
uint32_t rank = outputs[0]->attr.dim_num;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH
&& rank > 2;
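/* reshape_flg is now derived from the output shape itself: when size[1] * size[2]
 * still fits within GPU_TENSOR_MAX_WIDTH and the tensor has more than two
 * dimensions, the setup can presumably fold those axes into one; tensors with
 * rank > 4 are rejected just below since this path only handles up to 4D.
 */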
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
outputs[0]->attr.size, outputs[0]->attr.dim_num ) ||
rank > 4 )
{
return NULL;
}

View File

@ -76,9 +76,15 @@ static const _kernel_map_type _logical_ops_kernel_map[] =
PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, I8, I8, "or"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, I8, I8, "and"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, I8, I8, "xor"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, BF16, I8, "or"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, BF16, I8, "and"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, I8, I8, "or"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, I8, I8, "and"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, I8, I8, "xor"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, BF16, I8, "or"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, BF16, I8, "and"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"),
};
@ -159,6 +165,22 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniMulShortMinus1toFp16_2x8", &uniMulShortMinus1toFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (BF16 == input_dtype)
{
gpu_dp_inst_t uniConvertInt16toInt8_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertInt16toInt8_2x8", &uniConvertInt16toInt8_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
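/* Assumption, not verified against the shader source: for BF16 inputs the logical
 * result is already produced in 16-bit lanes, and this shuffle only narrows the
 * eight 16-bit values to the int8 (bool8) output layout, so no float conversion
 * uniforms are needed in this branch.
 */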
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );
@ -209,9 +231,13 @@ static vsi_status _query_kernel
return VSI_FAILURE;
}
if (BOOL8 == in_dtype && BOOL8 == out_dtype)
if (BOOL8 == in_dtype)
{
in_dtype = I8;
}
if (BOOL8 == out_dtype)
{
out_dtype = I8;
}

View File

@ -56,6 +56,7 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_12 "matrixmul_u8u8_f16"
#define KERNEL_SOURCE_13 "matrixmul_i16"
#define KERNEL_SOURCE_14 "matrixmul_f16i16_i16"
#define KERNEL_SOURCE_15 "matrixmul_bf16"
#define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b))
@ -110,6 +111,7 @@ static const struct {
TENSOR_MATRIX_MUL_KERNELS(I8, F16, F16, KERNEL_SOURCE_8)
TENSOR_MATRIX_MUL_KERNELS(I16, F16, F16, KERNEL_SOURCE_8)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, F16, KERNEL_SOURCE_2)
TENSOR_MATRIX_MUL_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, U8, KERNEL_SOURCE_11)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, I8, KERNEL_SOURCE_11)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, I16, KERNEL_SOURCE_11)
@ -119,6 +121,7 @@ static const struct {
TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4)
TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5)
TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5)
TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, I16, I16, KERNEL_SOURCE_7)
@ -126,6 +129,7 @@ static const struct {
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, F16, I8, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, F16, I16, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
};
/*
@ -587,6 +591,36 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
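/* For the BF16 cases handled below, inputs are widened to float32 with the two
 * Part0/Part1 shuffles, the accumulation runs in float, and uniExtractOddData_2x8
 * appears to take the high 16 bits of each 32-bit result to repack it as bfloat16,
 * i.e. a truncating float32 -> bf16 conversion without rounding.
 */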
float scaleIn0divOut = src0Scale / dstScale;
float scaleIn1divOut = src1Scale / dstScale;
@ -936,6 +970,22 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 0 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 0 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 0 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,

View File

@ -64,6 +64,10 @@ __BEGIN_DECLS
#define KERNEL_NAME_MAXIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8_2D")
#define KERNEL_NAME_MAXIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16")
#define KERNEL_NAME_MAXIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16_2D")
#define KERNEL_NAME_MAXIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8")
#define KERNEL_NAME_MAXIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8_2D")
#define KERNEL_NAME_MAXIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16")
#define KERNEL_NAME_MAXIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16_2D")
#define KERNEL_SOURCE_1 "maximum",
#define KERNEL_SOURCE_2 "maximum_fp16",
@ -109,6 +113,7 @@ static const struct {
TENSOR_MAX_KERNELS(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(F16, F16, U8, KERNEL_SOURCE_2)
@ -120,12 +125,14 @@ static const struct {
TENSOR_MAX_KERNELS(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS(I16, I16, U8, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2)
@ -137,6 +144,7 @@ static const struct {
TENSOR_MAX_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3)
};
static vx_param_description_t kernel_param_def[] =
@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (uint8_t)attr[0]->dfp.fl;
if (in0_fl > 0)
{
src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl);
}
else
{
src0Scale = (float)((int64_t)1 << -in0_fl);
}
}
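/* DFP scale is 2^(-fl): a positive fractional length shrinks the step size
 * (fl = 3 -> scale = 1/8) while a negative one enlarges it (fl = -2 -> scale = 4).
 * The same expansion is applied to attr[1] below.
 */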
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in1_fl = (uint8_t)attr[1]->dfp.fl;
if (in1_fl > 0)
{
src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl);
}
else
{
src1Scale = (float)((int64_t)1 << -in1_fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
attr[1]->dtype, attr[2]->dtype );
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16))
|| (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
case _PACK_SELECT_KEY( U8, U8, I16 ):
case _PACK_SELECT_KEY( I16, I16, U8 ):
{
uint16_t M0 = 0;
uint16_t M1 = 0;
@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
CHECK_STATUS_FAIL_GOTO(status, final );
if (attr[0]->dtype == U8)
if (attr[0]->dtype == U8 || attr[0]->dtype == I16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
}
@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
return node;
@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( maximum, _setup )

View File

@ -64,6 +64,10 @@ __BEGIN_DECLS
#define KERNEL_NAME_MINIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8_2D")
#define KERNEL_NAME_MINIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16")
#define KERNEL_NAME_MINIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16_2D")
#define KERNEL_NAME_MINIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8")
#define KERNEL_NAME_MINIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8_2D")
#define KERNEL_NAME_MINIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16")
#define KERNEL_NAME_MINIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16_2D")
#define KERNEL_SOURCE_1 "minimum",
#define KERNEL_SOURCE_2 "minimum_fp16",
@ -109,6 +113,7 @@ static const struct {
TENSOR_MIN_KERNELS(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(F16, F16, U8, KERNEL_SOURCE_2)
@ -120,12 +125,14 @@ static const struct {
TENSOR_MIN_KERNELS(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS(I16, I16, U8, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2)
@ -137,6 +144,7 @@ static const struct {
TENSOR_MIN_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3)
};
static vx_param_description_t kernel_param_def[] =
@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (uint8_t)attr[0]->dfp.fl;
if (in0_fl > 0)
{
src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl);
}
else
{
src0Scale = (float)((int64_t)1 << -in0_fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in1_fl = (uint8_t)attr[1]->dfp.fl;
if (in1_fl > 0)
{
src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl);
}
else
{
src1Scale = (float)((int64_t)1 << -in1_fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
attr[1]->dtype, attr[2]->dtype );
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16))
|| (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
case _PACK_SELECT_KEY( U8, U8, I16 ):
case _PACK_SELECT_KEY( I16, I16, U8 ):
{
uint16_t M0 = 0;
uint16_t M1 = 0;
@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
CHECK_STATUS_FAIL_GOTO(status, final );
if (attr[0]->dtype == U8)
if (attr[0]->dtype == U8 || attr[0]->dtype == I16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
}
@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
return node;
@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( minimum, _setup )

View File

@ -101,14 +101,17 @@ static const struct {
TENSOR_MOMENTS_KERNELS(I8, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(I16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(F16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(BF16,BF16,0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(U8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(I8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(I16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(F16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(BF16,BF16,1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(U8, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(BF16,BF16,2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(U8, U8, 0, KERNEL_SOURCE_6)
TENSOR_MOMENTS_KERNELS(U8, U8, 1, KERNEL_SOURCE_6)
TENSOR_MOMENTS_KERNELS(U8, U8, 2, KERNEL_SOURCE_6)
@ -116,26 +119,31 @@ static const struct {
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,BF16,0, 1, KERNEL_SOURCE_7)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, U8, 0, 1, KERNEL_SOURCE_6)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,BF16,0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, U8, 0, 1, 2, KERNEL_SOURCE_7)
TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(F16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(U8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 0, KERNEL_SOURCE_6)
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 1, KERNEL_SOURCE_6)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(BF16,BF16,0, 1, KERNEL_SOURCE_7)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, U8, 0, 1, KERNEL_SOURCE_6)
};
@ -461,6 +469,36 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
switch( pack_key )
{
@ -494,6 +532,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, 1, 1):
case _PACK_SELECT_KEY( I8, F16, 1, 1):
case _PACK_SELECT_KEY( I16, F16, 1, 1):
@ -518,6 +568,16 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 1):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, 1, 2):
case _PACK_SELECT_KEY( I8, F16, 1, 2):
case _PACK_SELECT_KEY( I16, F16, 1, 2):
@ -542,6 +602,15 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 2):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, 2, 0):
case _PACK_SELECT_KEY( I8, F16, 2, 0):
case _PACK_SELECT_KEY( I16, F16, 2, 0):
@ -597,6 +666,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 2, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, 3, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
@ -608,6 +689,19 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 3, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, U8, 1, 0):
case _PACK_SELECT_KEY( U8, U8, 1, 1):
case _PACK_SELECT_KEY( U8, U8, 1, 2):

View File

@ -68,27 +68,29 @@ typedef struct
static const _kernel_map_type _one_hot_kernel_map[] =
{
// Register kernel here
PACK_ONE_HOT_KERNEL_3D( U8, U8 ),
PACK_ONE_HOT_KERNEL_3D( U8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I8, I8 ),
PACK_ONE_HOT_KERNEL_3D( I8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I16, I16 ),
PACK_ONE_HOT_KERNEL_3D( I16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, I16 ),
PACK_ONE_HOT_KERNEL_3D( F16, U8 ),
PACK_ONE_HOT_KERNEL_3D( F16, I8 ),
PACK_ONE_HOT_KERNEL_3D( U8, U8 ),
PACK_ONE_HOT_KERNEL_3D( U8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I8, I8 ),
PACK_ONE_HOT_KERNEL_3D( I8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I16, I16 ),
PACK_ONE_HOT_KERNEL_3D( I16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, I16 ),
PACK_ONE_HOT_KERNEL_3D( F16, U8 ),
PACK_ONE_HOT_KERNEL_3D( F16, I8 ),
PACK_ONE_HOT_KERNEL_3D( BF16, BF16 ),
PACK_ONE_HOT_KERNEL_2D( U8, U8 ),
PACK_ONE_HOT_KERNEL_2D( U8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I8, I8 ),
PACK_ONE_HOT_KERNEL_2D( I8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I16, I16 ),
PACK_ONE_HOT_KERNEL_2D( I16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, I16 ),
PACK_ONE_HOT_KERNEL_2D( F16, U8 ),
PACK_ONE_HOT_KERNEL_2D( F16, I8 ),
PACK_ONE_HOT_KERNEL_2D( U8, U8 ),
PACK_ONE_HOT_KERNEL_2D( U8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I8, I8 ),
PACK_ONE_HOT_KERNEL_2D( I8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I16, I16 ),
PACK_ONE_HOT_KERNEL_2D( I16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, I16 ),
PACK_ONE_HOT_KERNEL_2D( F16, U8 ),
PACK_ONE_HOT_KERNEL_2D( F16, I8 ),
PACK_ONE_HOT_KERNEL_2D( BF16, BF16 ),
};
@ -274,6 +276,51 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case BF16:
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}

View File

@ -98,7 +98,6 @@ static const struct {
PRELU_KERNELS_2D(I8, F16, F16, _2D, KERNEL_SOURCE0)
PRELU_KERNELS_2D(U8, U8, U8, _2D, KERNEL_SOURCE0)
PRELU_KERNELS_2D(U8, U8, F16, _2D, KERNEL_SOURCE0)
};
static vx_param_description_t kernel_param_def[] =
@ -199,6 +198,7 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
out_fl = 1;
outputZP = (float)attr[2]->asymm.zero_point;
input_scale0 = input_scale0 / attr[2]->asymm.scale;
}
@ -628,7 +628,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
@ -643,4 +642,3 @@ final:
__END_DECLS
REGISTER_BACKEND_EVIS( prelu, _setup )

View File

@ -51,11 +51,13 @@ typedef enum
UP_2X_HALF,
UP_3X_HALF,
UP_4X_HALF,
UP_8X_HALF,
} _internal_scale_e;
#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
#define STR(a) #a
// Add kernel hashtable here
@ -81,19 +83,25 @@ typedef enum
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
typedef struct
{
@ -120,6 +128,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF(U8, U8),
};
@ -224,6 +233,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -280,6 +290,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}
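/* These fixed-ratio checks pick dedicated half-pixel-centers upsample shaders;
 * any other scale factor falls back to the generic bilinear path further down,
 * which uses the half_pixel_value uniform instead.
 */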
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
@ -330,7 +341,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
outputZP = 0;
}
if (is_2x_up_kernel || is_4x_up_kernel)
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
@ -479,6 +490,76 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
@ -965,25 +1046,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
goto final;
}
if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel)
if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel && !is_8x_up_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
CHECK_STATUS_FAIL_GOTO(status, final );
}
if (is_2x_up_kernel || is_4x_up_kernel)
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = depth;
gpu_param.dim = 2;
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = depth;
gpu_param.dim = 2;
}
else
{
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@ -1024,6 +1105,8 @@ static vsi_status _query_kernel
&& (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
_internal_scale_e scale_flag = UP;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
@ -1032,6 +1115,7 @@ static vsi_status _query_kernel
is_2x_upsample &= (in_dtype == U8);
is_3x_upsample &= (in_dtype == U8);
is_4x_upsample &= (in_dtype == U8);
is_8x_upsample &= (in_dtype == U8);
if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
{
@ -1047,6 +1131,10 @@ static vsi_status _query_kernel
{
scale_flag = UP_4X_HALF;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample)
{
scale_flag = UP_8X_HALF;
}
else if (is_same_type && is_evis2)
{
scale_flag = UP_OPT;
@ -1123,7 +1211,6 @@ static vsi_status _query_kernel
}
return status;
} /* _query_kernel() */
static vsi_nn_tensor_t* _create_scale_tensor
@ -1307,4 +1394,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( resize_bilinear, _setup )

View File

@ -74,6 +74,7 @@ static const struct {
TENSOR_SCATTER_ND_KERNELS(I32, U8, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_KERNELS(I32, I16, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_KERNELS(I32, F16, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_KERNELS(I32, BF16,BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_BIG_KERNELS(I32, I8, I8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_BIG_KERNELS(I32, U8, U8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_BIG_KERNELS(I32, I16, I16, KERNEL_SOURCE_2)
@ -250,8 +251,45 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
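/*
 * Illustrative sketch (not part of the original change): bfloat16 keeps the top 16
 * bits of an IEEE-754 float32, so the two uniConvBF16toF32_Part*_2x8 tables above
 * only need to move each 16-bit lane into the high half of a 32-bit word; no
 * arithmetic is involved.  A scalar equivalent, for reference only:
 *
 *   static float bf16_to_f32(uint16_t bf16)
 *   {
 *       uint32_t bits = ((uint32_t)bf16) << 16;   // place BF16 bits in the high half
 *       float f;
 *       memcpy(&f, &bits, sizeof(f));             // reinterpret as float32
 *       return f;
 *   }
 *
 * uniExtractOddData_2x8 performs the reverse step on store, keeping only the high
 * 16-bit halves of each float32 lane.
 */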
status = vsi_nn_kernel_gpu_add_param( node,
"uniAccumulateSum_2x8", &uniAccumulateSum_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num );
status |= vsi_nn_kernel_gpu_add_param( node, "zeropoint", &output_zp );
status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );

View File

@ -67,6 +67,13 @@ static vsi_status _gpu_register
vsi_nn_kernel_t* kernel
);
static vsi_status _gpu_register_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
);
static vx_program _create_program_from_executable
(
vsi_nn_graph_t* graph,
@ -79,6 +86,13 @@ static vx_program _create_program_from_code
vsi_nn_kernel_t* kernel
);
static vx_program _create_program_from_code_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
);
static const uint8_t* _load_internal_executable
(
const char* source_name,
@ -104,6 +118,14 @@ static void _kernel_clear_source
static vsi_bool _check_shader_support(vsi_nn_graph_t* graph);
static vsi_bool vsi_nn_kernel_is_asymmtric_int8
(
vsi_nn_tensor_t** inputs,
size_t input_num,
vsi_nn_tensor_t** outputs,
size_t output_num
);
static vsi_status VX_CALLBACK _kernel_validator
(
vx_node node,
@ -290,7 +312,7 @@ static char* _load_source_code_from_file
size_t read_bytes;
source = NULL;
//TODO: Pack new name
fp = fopen( source_name, "rb" );
fp = vsi_nn_fopen( source_name, "rb" );
if( NULL == fp )
{
VSILOGE("Open program file %s fail.", source_name);
@ -414,6 +436,58 @@ static vx_program _create_program_from_code
return program;
} /* _create_program_from_code() */
static vx_program _create_program_from_code_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
)
{
const vsi_nn_kernel_source_info_t* source_info;
kernel_program_info_t* program_info;
size_t i;
vx_program program = NULL;
source_info = &kernel->gpu.sources[VSI_NN_GPU_SOURCE_FMT_CODE];
if( source_info->num == 0 )
{
VSILOGE("Not executable source found in kernel.");
return NULL;
}
program_info = (kernel_program_info_t*)malloc(
source_info->num * sizeof(kernel_program_info_t) );
if( !program_info )
{
VSILOGE("Malloc program memory fail.");
return NULL;
}
memset( program_info, 0, source_info->num * sizeof(kernel_program_info_t) );
for( i = 0; i < source_info->num; i ++ )
{
program_info[i].data = (const void*)(resources[i]);
if( !program_info[i].data )
{
program_info[i].reserve_mem = (void*)_load_source_code_from_file(
source_info->data[i], &program_info[i].size );
program_info[i].data = (const void*)program_info[i].reserve_mem;
}
}
program = _create_program( graph->ctx->c, program_info, source_info->num );
if( program_info )
{
for( i = 0; i < source_info->num; i ++ )
{
if( program_info[i].reserve_mem )
{
free( program_info[i].reserve_mem );
}
}
free( program_info );
}
return program;
} /* _create_program_from_code_ext() */
static vx_program _create_program_from_executable
(
vsi_nn_graph_t* graph,
@ -547,6 +621,113 @@ static vsi_status _gpu_register
return status;
} /* _gpu_register() */
static vsi_status _gpu_register_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
)
{
vsi_status status;
vx_kernel_description_t* info;
vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
#define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE;
info = &(kernel->info);
switch( active_fmt )
{
case VSI_NN_GPU_SOURCE_FMT_CODE:
program = _create_program_from_code_ext( graph, kernel, resources );
break;
case VSI_NN_GPU_SOURCE_FMT_EXECUTABLE:
program = _create_program_from_executable( graph, kernel );
break;
default:
VSILOGE("Unknown source format %d", kernel->gpu.active_source_fmt);
break;
}
if( NULL == program )
{
return status;
}
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
{
// set default evis version to 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va );
}
}
else
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va );
}
// Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data )
{
vsi_nn_kernel_build_option_t * option = &kernel->gpu.sources[active_fmt].build_option;
if( MAX_BUILDPROGRAM_LEN - cost_bytes > strlen( option->data ) + 1 )
{
snprintf( &cmd[cost_bytes], MAX_BUILDPROGRAM_LEN - cost_bytes,
" %s", option->data );
}
else
{
VSILOGE("Build option is too long!");
VSI_ASSERT( FALSE );
}
}
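/*
 * Illustrative note (not part of the original change): on an EVIS 2 device with a
 * 32-bit VA configuration and a kernel that carries the build option
 * "-D LOCAL_SIZE=16", the command assembled above and passed to vxBuildProgram()
 * below would read (values chosen only as an example):
 *
 *   "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=0 -D LOCAL_SIZE=16"
 */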
status = vxBuildProgram( program, cmd );
if( VSI_SUCCESS != status )
{
VSILOGE("Build program fail.");
return status;
}
obj = vxAddKernelInProgram(
program,
info->name,
info->enumeration,
info->numParams,
info->validate,
info->initialize,
info->deinitialize
);
if( obj )
{
status = _kernel_init_obj( info, obj );
//vxReleaseKernel( &obj );
}
else
{
VSILOGE( "Add kernel %s fail.", info->name );
}
if( program )
{
vxReleaseProgram( &program );
}
return status;
} /* _gpu_register_ext() */
static vsi_status _kernel_init_obj
(
vx_kernel_description_t* info,
@ -620,6 +801,19 @@ vsi_status vsi_nn_kernel_register
return status;
} /* vsi_nn_kernel_register() */
vsi_status vsi_nn_kernel_register_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
)
{
vsi_status status;
status = VSI_FAILURE;
status = _gpu_register_ext( graph, kernel, resources );
return status;
} /* vsi_nn_kernel_register_ext() */
vsi_nn_kernel_node_t vsi_nn_kernel_create_node
(
vsi_nn_graph_t* graph,
@ -667,7 +861,6 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node
status = vxGetStatus( (vx_reference)obj );
if (VSI_SUCCESS != status)
{
fprintf(stderr, "\n"); // TODO: This is a hack for driver msg
/* Register kernel */
status = vsi_nn_kernel_register( graph, kernel );
if( VSI_SUCCESS != status )
@ -712,6 +905,92 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node
return (vsi_nn_kernel_node_t)node;
} /* vsi_nn_kernel_create_node() */
vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
){
vsi_status status;
vx_context ctx;
vx_kernel obj;
vx_node node;
vx_kernel_description_t* info;
info = &(kernel->info);
// Validate kernel
if( !info->initialize )
{
VSILOGE("Kernel %s initializer is NULL", info->name);
return NULL;
}
if( !info->validate )
{
VSILOGE("Kernel %s validator is NULL", info->name);
return NULL;
}
if( !info->deinitialize )
{
VSILOGE("Kernel %s deinitializer is NULL", info->name);
return NULL;
}
if( info->enumeration == KERNEL_ID_PLACEHOLDER )
{
//VSILOGD("Kernel id: %#x, %#x", kernel->unique_id, info->enumeration);
info->enumeration = (vx_enum)kernel->unique_id;
}
ctx = vxGetContext( (vx_reference)graph->g );
obj = vxGetKernelByName( ctx, info->name );
status = vxGetStatus( (vx_reference)obj );
if (VSI_SUCCESS != status)
{
fprintf(stderr, "\n"); // TODO: This is a hack for driver msg
/* Register kernel */
status = vsi_nn_kernel_register_ext( graph, kernel, resources );
if( VSI_SUCCESS != status )
{
VSILOGE( "Register client kernel %s fail with %d.",
info->name, status );
return NULL;
}
else
{
VSILOGD( "Register client kernel %s successfully.",
info->name );
}
/* Load kernel */
obj = vxGetKernelByName( ctx, info->name );
status = vxGetStatus( (vx_reference)obj );
}
if( VSI_SUCCESS != status )
{
VSILOGE( "Load client kernel %s fail with %d.",
info->name, status );
return NULL;
}
node = vxCreateGenericNode( graph->g, obj );
vxReleaseKernel( &obj );
status = vxGetStatus( (vx_reference)node );
if( VSI_SUCCESS != status )
{
VSILOGE( "Load client node from kernel %s fail with %d.",
info->name, status );
return NULL;
}
if( node )
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_REPLICATE;
border.constant_value.U32 = 0;
status |= vxSetNodeAttribute( node, VX_NODE_BORDER, &border, sizeof(border) );
}
return (vsi_nn_kernel_node_t)node;
} /* vsi_nn_kernel_create_node_ext() */
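/*
 * Illustrative usage sketch (not part of the original change): the _ext entry
 * points let a caller hand in kernel sources that are already in memory, one
 * string per source registered for VSI_NN_GPU_SOURCE_FMT_CODE; NULL entries fall
 * back to loading from file as in _create_program_from_code_ext() above.  The
 * resource array and source string below are hypothetical; only the call shape is
 * taken from the code above.
 *
 *   static const char s_my_kernel_src[] = "__kernel void my_op(...) { ... }";
 *
 *   const char* resources[] = { s_my_kernel_src };
 *   vsi_nn_kernel_node_t node =
 *       vsi_nn_kernel_create_node_ext( graph, kernel, resources );
 *   if( !node )
 *   {
 *       VSILOGE( "Create node from in-memory source fail." );
 *   }
 */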
vsi_status vsi_nn_kernel_node_set_border
(vsi_nn_kernel_node_t node,
vx_border_t* border)
@ -987,7 +1266,8 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
/* Skip evis and cl when disable shader */
if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL)
&& _check_shader_support(graph) == FALSE)
&& ( _check_shader_support(graph) == FALSE ||
vsi_nn_kernel_is_asymmtric_int8(inputs, input_num, outputs, output_num) ) )
{
continue;
}
@ -1292,3 +1572,38 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
return FALSE;
}
static vsi_bool vsi_nn_kernel_is_asymmtric_int8
(
vsi_nn_tensor_t** inputs,
size_t input_num,
vsi_nn_tensor_t** outputs,
size_t output_num
)
{
size_t i = 0;
for (i = 0; i < input_num; i++)
{
if ( inputs[i] &&
inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
inputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
)
{
return TRUE;
}
}
for (i = 0; i < output_num; i++)
{
if ( outputs[i] &&
outputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
outputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
)
{
return TRUE;
}
}
return FALSE;
}

View File

@ -361,7 +361,6 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape
return ret;
} /* vsi_nn_kernel_optimize_softmax_shape() */
typedef enum
{
TILE_STATE_AXIS_X = 0,
@ -611,4 +610,47 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
*out_rank = vsi_nn_min(dim_num, 3);
return TRUE;
}
vsi_status vsi_nn_kernel_optimize_group_norm_shape
(
const vsi_size_t* shape, const uint32_t rank, int32_t groups,
int32_t is_sp_kernel, vsi_size_t* out_shape
)
{
vsi_status status = VSI_SUCCESS;
uint32_t i = 0;
vsi_size_t out_rank = 0;
vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
group_shape[0] = shape[0];
group_shape[1] = shape[1];
group_shape[2] = shape[2] / groups;
vsi_nn_kernel_optimize_element_shape( group_shape, 3, out_shape, &out_rank );
if (!is_sp_kernel && out_shape[1] == 1 && out_rank < 3)
{
out_shape[1] = groups;
out_shape[2] = 1;
out_shape[3] = 1;
for (i = 3; i < rank; i++)
{
out_shape[3] = out_shape[3] * shape[i];
}
}
else if (out_rank == 2)
{
out_shape[2] = groups;
out_shape[3] = 1;
for (i = 3; i < rank; i++)
{
out_shape[3] = out_shape[3] * shape[i];
}
}
else
{
status = VSI_FAILURE;
}
return status;
}
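/*
 * Illustrative note (not part of the original change): for a rank-4 input of shape
 * {W, H, C, N} with C = groups * channels_per_group, the helper above first
 * flattens the per-group volume {W, H, C / groups} with
 * vsi_nn_kernel_optimize_element_shape() and then appends the group and batch
 * dimensions.  For example, shape {7, 5, 32, 2} with groups = 8 gives group_shape
 * {7, 5, 4}; assuming the element-shape helper merges those contiguous dims into
 * {140, 1}, the non-SP branch produces out_shape {140, 8, 1, 2}
 * (elements per group, groups, 1, batch).
 */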

View File

@ -0,0 +1,84 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include <float.h>
#include "utils/vsi_nn_dtype_util_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_lut.h"
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
node = vxBatchNormalizationLayer(
graph->g,
eps,
inputs[1]->t,
inputs[2]->t,
inputs[3]->t,
inputs[4]->t,
inputs[0]->t,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* _setup() */
#define REGISTER_BATCH_NORM_OPENVX_KERNEL(KERNEL_NAME) \
static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num, \
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
) \
{ \
return _setup(graph, inputs, input_num, outputs, output_num, \
params, kernel); \
} \
REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_BATCH_NORM_OPENVX_KERNEL( batch_norm )
#undef REGISTER_BATCH_NORM_OPENVX_KERNEL

View File

@ -181,6 +181,51 @@ static vsi_bool _build_vx_conv3d_param
} /* _build_vx_conv2d_param() */
#endif
#if VX_DECONV_3D_API_SUPPORT
static vsi_bool _build_vx_deconv3d_param
(
vx_nn_deconvolution_3d_params_t * param,
int32_t stride_d, int32_t stride_h, int32_t stride_w,
int32_t pad_d_front, int32_t pad_d_end,
int32_t pad_h_front, int32_t pad_h_end,
int32_t pad_w_front, int32_t pad_w_end,
int32_t outpadding_d, int32_t outpadding_h, int32_t outpadding_w,
int32_t group, vsi_enum overflow_policy,
vsi_enum rounding_policy, vsi_enum down_scale_size_rounding
)
{
VSI_ASSERT( stride_d > 0 );
VSI_ASSERT( stride_h > 0 );
VSI_ASSERT( stride_w > 0 );
VSI_ASSERT( outpadding_d >= 0 );
VSI_ASSERT( outpadding_h >= 0 );
VSI_ASSERT( outpadding_w >= 0 );
VSI_ASSERT( group >= 0 );
param->padding_d_front = (uint32_t)pad_d_front;
param->padding_d_rear = (uint32_t)pad_d_end;
param->padding_h_top = (uint32_t)pad_h_front;
param->padding_h_bottom = (uint32_t)pad_h_end;
param->padding_w_left = (uint32_t)pad_w_front;
param->padding_w_right = (uint32_t)pad_w_end;
param->a_w = outpadding_w;
param->a_h = outpadding_h;
param->a_d = outpadding_d;
param->overflow_policy = (vx_enum)overflow_policy;
param->rounding_policy = (vx_enum)rounding_policy;
param->down_scale_size_rounding = (vx_enum)down_scale_size_rounding;
param->channel_group = group;
param->stride_w = (uint32_t)stride_w;
param->stride_h = (uint32_t)stride_h;
param->stride_d = (uint32_t)stride_d;
return TRUE;
} /* _build_vx_deconv3d_param() */
#endif
static vx_tensor _expand_tensor_dim
( vx_tensor tensor, vsi_ssize_t * shape, size_t rank, vsi_ssize_t expand_dim )
{
@ -242,7 +287,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
vx_tensor temp_tensors[3] = { NULL };
int i;
uint32_t i = 0;
_build_vx_conv2d_param(
&vxparam,
@ -270,7 +315,6 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
{
uint8_t * data = NULL;
vsi_nn_tensor_attr_t attr;
uint32_t i;
data = vsi_nn_ConvertTensorToData( graph, inputs[1] );
CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final );
@ -317,7 +361,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
vx_tensor temp_tensors[3] = { NULL };
int32_t i;
uint32_t i = 0;
vsi_bool need_explicit_padding = FALSE;
_build_vx_conv2d_param(
@ -344,7 +388,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
new_w_shape[0] = inputs[1]->attr.size[0];
new_w_shape[1] = 1;
new_w_shape[2] = 1;
for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++)
for (i = 1; i < inputs[1]->attr.dim_num; i++)
{
new_w_shape[2] *= inputs[1]->attr.size[i];
}
@ -358,7 +402,6 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
{
uint8_t * data = NULL;
vsi_nn_tensor_attr_t attr;
uint32_t i;
data = vsi_nn_ConvertTensorToData( graph, inputs[1] );
CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final );
@ -576,4 +619,41 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d )
return (vsi_nn_kernel_node_t)node;
} /* conv3d */
#undef REGISTER_CONV_OPENVX_KERNEL
REGISTER_CONV_OPENVX_KERNEL( deconv3d )
{
vx_node node = NULL;
#if VX_DECONV_3D_API_SUPPORT
vx_nn_deconvolution_3d_params_t vxparam;
memset(&vxparam, 0, sizeof(vxparam));
_build_vx_deconv3d_param(
&vxparam,
vsi_nn_kernel_param_get_int32(params, "stride_d"),
vsi_nn_kernel_param_get_int32(params, "stride_h"),
vsi_nn_kernel_param_get_int32(params, "stride_w"),
vsi_nn_kernel_param_get_int32(params, "pad_front"),
vsi_nn_kernel_param_get_int32(params, "pad_end"),
vsi_nn_kernel_param_get_int32(params, "pad_top"),
vsi_nn_kernel_param_get_int32(params, "pad_bottom"),
vsi_nn_kernel_param_get_int32(params, "pad_left"),
vsi_nn_kernel_param_get_int32(params, "pad_right"),
vsi_nn_kernel_param_get_int32(params, "outpadding_w"),
vsi_nn_kernel_param_get_int32(params, "outpadding_h"),
vsi_nn_kernel_param_get_int32(params, "outpadding_w"),
vsi_nn_kernel_param_get_int32(params, "group"),
vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding")
);
node = vxDeconv3dLayer( graph->g,
inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL,
&vxparam,
sizeof( vxparam),
outputs[0]->t
);
#endif
return (vsi_nn_kernel_node_t)node;
} /* deconv3d */
#undef REGISTER_CONV_OPENVX_KERNEL

View File

@ -0,0 +1,113 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#define REGISTER_PAD2_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_PAD2_OPENVX_KERNEL( pad2 )
{
vx_node node = NULL;
vx_nn_pad_params_t param;
size_t dim_num = 0;
int32_t* front_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "front_size", &dim_num);
int32_t* back_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "back_size", &dim_num);
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "pad_mode");
int32_t pad_front_array[VSI_NN_MAX_DIM_NUM] = {0};
int32_t pad_back_array[VSI_NN_MAX_DIM_NUM] = {0};
vsi_nn_tensor_t *convert_tensor = NULL;
float const_val = vsi_nn_kernel_param_get_float32(params, "const_val");
memset(&param, 0, sizeof(param));
memset(pad_front_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
memset(pad_back_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
memcpy(pad_front_array, front_size, sizeof(int32_t) * dim_num);
memcpy(pad_back_array, back_size, sizeof(int32_t) * dim_num);
param.pad_mode = pad_mode;
param.pad_const = vxCreateScalar( graph->ctx->c, VX_TYPE_FLOAT32, &const_val );
param.numViewDimensions = (uint8_t)vsi_nn_max(dim_num, 2);
param.pad_front_array = pad_front_array;
param.pad_back_array = pad_back_array;
if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
vsi_nn_tensor_attr_t attr;
memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
attr.vtl = FALSE;
attr.is_const = FALSE;
convert_tensor = vsi_nn_CreateTensor(graph, &attr);
node = vxTensorCopyNode(
graph->g,
inputs[0]->t,
convert_tensor->t
);
}
else
{
convert_tensor = vsi_nn_reshape_tensor( graph,
inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num );
}
node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, &param, sizeof(param) );
vxReleaseScalar( &param.pad_const );
vsi_safe_release_tensor(convert_tensor);
return (vsi_nn_kernel_node_t)node;
} /* pad2() */
#undef REGISTER_PAD2_OPENVX_KERNEL

View File

@ -0,0 +1,37 @@
#pragma OPENCL EXTENSION CL_VIV_asm : enable
__kernel void clip_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float minData,
float maxData)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
float4 src;
_viv_asm(COPY, src, src0, 16);
float4 dst0 = clamp(src, minData, maxData);
uint4 dst;
_viv_asm(COPY, dst, dst0, 16);
dst = dst >> 16;
write_imageui(output, coord, dst);
}
__kernel void clip_BF16toBF16_2D(
__read_only image2d_t input,
__write_only image2d_t output,
float minData,
float maxData)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
float4 src;
_viv_asm(COPY, src, src0, 16);
float4 dst0 = clamp(src, minData, maxData);
uint4 dst;
_viv_asm(COPY, dst, dst0, 16);
dst = dst >> 16;
write_imageui(output, coord, dst);
}

View File

@ -0,0 +1,17 @@
__kernel void depth2space_crd_F32toF32(
image2d_array_t input, image2d_array_t output, int block_size)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int block_e2 = block_size * block_size;
ushort blk = (ushort)block_size;
int inx = (int)((ushort)gidx / blk);
int iny = (int)((ushort)gidy / blk);
int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2;
int4 coord_in = (int4)(inx, iny, inz, 0);
float4 data = read_imagef(input, coord_in);
write_imagef(output, coord_out, data);
}
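/*
 * Illustrative note (not part of the original change): in CRD (column-row-depth)
 * mode the input channel index interleaves the block offsets before the output
 * channel.  For block_size = 2, output pixel (gidx, gidy, gidz) = (5, 3, 1) reads
 * input (5 / 2, 3 / 2, (5 % 2) + (3 % 2) * 2 + 1 * 4) = (2, 1, 7).
 */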

View File

@ -3,6 +3,11 @@ float eltwise_unary_sin(float x, float alpha, float beta)
return native_sin(x);
}
float eltwise_unary_cos(float x, float alpha, float beta)
{
return native_cos(x);
}
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
float eltwise_unary_exp(float x, float alpha, float beta)
@ -135,6 +140,7 @@ __kernel void func_name##_F32toF32 \
write_imagef(output, coord, dst.xxxx); \
}
ELTWISE_UNARY_F32(sin)
ELTWISE_UNARY_F32(cos)
ELTWISE_UNARY_F32(exp)
ELTWISE_UNARY_F32(log)
ELTWISE_UNARY_F32(elu)
@ -168,6 +174,7 @@ __kernel void func_name##_F32toF32_2D \
write_imagef(output, coord, dst.xxxx); \
}
ELTWISE_UNARY_F32_2D(sin)
ELTWISE_UNARY_F32_2D(cos)
ELTWISE_UNARY_F32_2D(exp)
ELTWISE_UNARY_F32_2D(log)
ELTWISE_UNARY_F32_2D(elu)
@ -202,6 +209,7 @@ __kernel void func_name##_U8toU8 \
write_imageui(output, coord, dst); \
}
ELTWISE_UNARY_U8(sin)
ELTWISE_UNARY_U8(cos)
ELTWISE_UNARY_U8(exp)
ELTWISE_UNARY_U8(log)
ELTWISE_UNARY_U8(elu)
@ -236,6 +244,7 @@ __kernel void func_name##_U8toU8_2D \
write_imageui(output, coord, dst); \
}
ELTWISE_UNARY_U8_2D(sin)
ELTWISE_UNARY_U8_2D(cos)
ELTWISE_UNARY_U8_2D(exp)
ELTWISE_UNARY_U8_2D(log)
ELTWISE_UNARY_U8_2D(elu)

View File

@ -1,7 +1,15 @@
__kernel void floordiv_F32F32toF32(
__kernel void floordiv_F32F32toF32
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output)
__write_only image2d_array_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
float4 src0;
@ -12,10 +20,18 @@ __kernel void floordiv_F32F32toF32(
write_imagef(output, coord, dst);
}
__kernel void floordiv_F32F32toF32_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output)
__kernel void floordiv_F32F32toF32_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
float4 src0 = read_imagef(input, coord);
@ -24,33 +40,8 @@ __kernel void floordiv_F32F32toF32_2D(
write_imagef(output, coord, dst);
}
__kernel void floordiv_I32I32toI32(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 src0;
int4 src1;
READ_IMAGEI_2DARRAY(src0, input, coord);
READ_IMAGEI_2DARRAY(src1, input1, coord);
int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));
write_imagei(output, coord, dst);
}
__kernel void floordiv_I32I32toI32_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 src0 = read_imagei(input, coord);
int4 src1 = read_imagei(input1, coord);
int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));
write_imagei(output, coord, dst);
}
__kernel void floordiv_I32I32toU8(
__kernel void floordiv_I32I32toI32
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -59,7 +50,56 @@ __kernel void floordiv_I32I32toU8(
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 src0;
int4 src1;
READ_IMAGEI_2DARRAY(src0, input, coord);
READ_IMAGEI_2DARRAY(src1, input1, coord);
float4 in0 = convert_float4(src0) * input0Scale + input0Tail;
float4 in1 = convert_float4(src1) * input1Scale + input1Tail;
float4 out = floor(in0 / in1) * outputScale + outputTail;
int4 dst = convert_int4(out);
write_imagei(output, coord, dst);
}
__kernel void floordiv_I32I32toI32_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 src0 = read_imagei(input, coord);
int4 src1 = read_imagei(input1, coord);
float4 in0 = convert_float4(src0) * input0Scale + input0Tail;
float4 in1 = convert_float4(src1) * input1Scale + input1Tail;
float4 out = floor(in0 / in1) * outputScale + outputTail;
int4 dst = convert_int4(out);
write_imagei(output, coord, dst);
}
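/*
 * Illustrative note (not part of the original change): the scale/tail pairs fold
 * the usual affine dequantization x = scale * (q - zero_point) into a single
 * multiply-add, with tail = -scale * zero_point precomputed on the host.  For
 * example, with input0Scale = 0.5 and input0Tail = -2.0 (zero point 4), a raw
 * value q = 9 dequantizes to 0.5 * 9 - 2.0 = 2.5 before the floor-divide, and the
 * result is requantized the same way through outputScale/outputTail.
 */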
__kernel void floordiv_I32I32toU8
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 src0;
@ -73,16 +113,18 @@ __kernel void floordiv_I32I32toU8(
write_imageui(output, coord, dst);
}
__kernel void floordiv_I32I32toU8_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
__kernel void floordiv_I32I32toU8_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 src0 = read_imagei(input, coord);
@ -94,7 +136,8 @@ __kernel void floordiv_I32I32toU8_2D(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8U8toU8(
__kernel void floordiv_U8U8toU8
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -103,7 +146,8 @@ __kernel void floordiv_U8U8toU8(
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
uint4 src0, src1;
@ -117,16 +161,18 @@ __kernel void floordiv_U8U8toU8(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8U8toU8_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
__kernel void floordiv_U8U8toU8_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 src0 = read_imageui(input, coord);
@ -139,7 +185,8 @@ __kernel void floordiv_U8U8toU8_2D(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8I32toU8(
__kernel void floordiv_U8I32toU8
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -148,7 +195,8 @@ __kernel void floordiv_U8I32toU8(
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
uint4 src0;
@ -163,16 +211,18 @@ __kernel void floordiv_U8I32toU8(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8I32toU8_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
__kernel void floordiv_U8I32toU8_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 src0 = read_imageui(input, coord);

View File

@ -5,7 +5,8 @@ __kernel void gather_U8toU8(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
@ -29,7 +30,8 @@ __kernel void gather_F16toF16(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
@ -53,7 +55,8 @@ __kernel void gather_I32toI32(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
@ -77,7 +80,8 @@ __kernel void gather_F32toF32(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size

View File

@ -0,0 +1,123 @@
__kernel void gather_batch_U8toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
uint4 data = read_imageui(input0, coord_in);
coord_in.z++;
write_imageui(output, coord, data);
coord.z++;
}
}
__kernel void gather_batch_F16toF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in);
coord_in.z++;
write_imagef(output, coord, data);
coord.z++;
}
}
__kernel void gather_batch_I32toI32(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
int4 data = read_imagei(input0, coord_in);
coord_in.z++;
write_imagei(output, coord, data);
coord.z++;
}
}
__kernel void gather_batch_F32toF32(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in);
coord_in.z++;
write_imagef(output, coord, data);
coord.z++;
}
}
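/*
 * Illustrative note (not part of the original change): compared with the plain
 * gather kernels, the *_batch variants read one index per batch from the extra y
 * dimension of input1 and step coord_in.z / coord.z once per batch.  For
 * block_num = 3, axis_num = 10, indices_num = 4, output row gidy = 2 of block
 * gidz = 1 in batch b is taken from input row gidz * axis_num + indices[2, b] of
 * slice b and written to output row gidz * indices_num + gidy = 6 of slice b.
 */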

View File

@ -112,6 +112,48 @@ __kernel void moments_axis0_I32toF32(
vari.x = sqr * dimRatio * input_scale * input_scale;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidy, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
__kernel void moments_axis0_BF16toF32(
__read_only image2d_array_t input,
__write_only image2d_t output_mean,
__write_only image2d_t output_vari,
int axis,
int axis_num,
int input_zp,
float input_scale,
int width,
int height,
int chn,
float dimRatio
)
{
int gidy = get_global_id(0);
int gidz = get_global_id(1);
int4 coord0 = (int4)(0, gidy, gidz, 0);
float4 data;
float sum = 0, sqr = 0;
for(coord0.x = 0; coord0.x < width;)
{
uint4 src0 = read_imageui(input, coord0);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord0.x++;
sum = sum + data.x;
sqr = sqr + data.x * data.x;
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidy, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}

View File

@ -172,3 +172,63 @@ __kernel void moments_axis01_I32toF32(
write_imagef(output_vari, coord_out, vari);
}
}
__kernel void moments_axis01_BF16toF32(
image2d_array_t input, image2d_t output_mean, image2d_t output_vari,
int axis, int axis_num, int input_zp, float input_scale,
int width, int height, int chn, float dimRatio
)
{
int gidx = get_global_id(0);
int gidz = get_global_id(1);
int lidx = get_local_id(0);
int4 coord = (int4)(gidx, 0, gidz, 0);
float4 data;
float sum = 0, sqr = 0;
__local float lcl_sum[16];
__local float lcl_sqr[16];
for(coord.x = gidx; coord.x < width; coord.x += 16)
{
float tmpSum = 0, tmpSqr = 0;
for(coord.y = 0; coord.y < height;)
{
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord.y++;
tmpSum = tmpSum + data.x;
tmpSqr = tmpSqr + data.x * data.x;
}
sqr += tmpSqr;
sum += tmpSum;
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(gidz, 0);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
}
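/*
 * Illustrative note (not part of the original change): all of the moments kernels
 * accumulate sum(x) and sum(x^2) and then apply E[x^2] - (E[x])^2, i.e.
 * vari = sqr * dimRatio - mean * mean, with dimRatio the reciprocal of the reduced
 * element count.  For the values {1, 2, 3, 4} this gives mean = 2.5 and
 * vari = 7.5 - 6.25 = 1.25.
 */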

View File

@ -177,3 +177,64 @@ __kernel void moments_axis012_I32toF32(
write_imagef(output_vari, coord_out, vari);
}
}
__kernel void moments_axis012_BF16toF32(
image2d_array_t input, image2d_t output_mean, image2d_t output_vari,
int axis, int axis_num, int input_zp, float input_scale,
int width, int height, int chn, float dimRatio
)
{
int gidx = get_global_id(0);
int lidx = get_local_id(0);
int4 coord = (int4)(gidx, 0, 0, 0);
float4 data;
float sum = 0, sqr = 0;
__local float lcl_sum[16];
__local float lcl_sqr[16];
for(coord.z = 0; coord.z < chn; coord.z++)
{
for(coord.x = gidx; coord.x < width; coord.x += 16)
{
float tmpSum = 0, tmpSqr = 0;
for(coord.y = 0; coord.y < height;)
{
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord.y++;
tmpSum = tmpSum + data.x;
tmpSqr = tmpSqr + data.x * data.x;
}
sqr += tmpSqr;
sum += tmpSum;
}
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(0, 0);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
}

View File

@ -106,6 +106,47 @@ __kernel void moments_axis1_I32toF32(
vari.x = sqr * dimRatio * input_scale * input_scale;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidx, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
__kernel void moments_axis1_BF16toF32(
__read_only image2d_array_t input,
__write_only image2d_t output_mean,
__write_only image2d_t output_vari,
int axis,
int axis_num,
int input_zp,
float input_scale,
int width,
int height,
int chn,
float dimRatio
)
{
int gidx = get_global_id(0);
int gidz = get_global_id(1);
int4 coord0 = (int4)(gidx, 0, gidz, 0);
float4 data;
float sum = 0, sqr = 0;
for(coord0.y = 0; coord0.y < height;)
{
uint4 src0 = read_imageui(input, coord0);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord0.y++;
sum = sum + data.x;
sqr = sqr + data.x * data.x;
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidx, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}

View File

@ -123,4 +123,46 @@ __kernel void moments_axis2_I32toF32(
int2 coord_out = (int2)(gidx, gidy);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
}
__kernel void moments_axis2_BF16toF32(
__read_only image2d_array_t input,
__write_only image2d_t output_mean,
__write_only image2d_t output_vari,
int axis,
int axis_num,
int input_zp,
float input_scale,
int width,
int height,
int chn,
float dimRatio
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int4 coord0 = (int4)(gidx, gidy, 0, 0);
float4 data;
float sum = 0, sqr = 0;
for(coord0.z = 0; coord0.z < chn;)
{
uint4 src0 = read_imageui(input, coord0);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord0.z++;
sum = sum + data.x;
sqr = sqr + data.x * data.x;
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidx, gidy);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}

View File

@ -0,0 +1,251 @@
#define TOPK_F32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toF32_I32 \
( \
__read_only image2d_t input, \
__write_only image2d_t output, \
__write_only image2d_t indices, \
int num_stages, \
int width \
) \
{ \
uint local_id = get_local_id(0); \
uint work_group_size = get_local_size(0); \
uint offset = 0; \
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
float data = read_imagef(input, coord.zy).x; \
float right = coord.z < width ? data : -2147483647.0f; \
\
local_data[local_id] = left; \
local_indices[local_id] = local_id; \
local_data[local_id + work_group_size] = right; \
local_indices[local_id + work_group_size] = local_id + work_group_size; \
\
barrier(CLK_LOCAL_MEM_FENCE); \
\
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (local_id >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
uint left_idx = local_indices[left_id]; \
uint right_idx = local_indices[right_id]; \
\
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
\
float4 dst; \
dst.x = local_data[local_id]; \
dst.y = local_data[local_id + work_group_size]; \
\
write_imagef(output, coord.xy, dst.xxxx); \
write_imagef(output, coord.zy, dst.yyyy); \
\
int4 index; \
index.x = ((int*)local_indices)[local_id]; \
index.y = ((int*)local_indices)[local_id + work_group_size]; \
\
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32(1 << 0, 0)
TOPK_F32(1 << 1, 1)
TOPK_F32(1 << 2, 2)
TOPK_F32(1 << 3, 3)
TOPK_F32(1 << 4, 4)
TOPK_F32(1 << 5, 5)
TOPK_F32(1 << 6, 6)
#define TOPK_U32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \
( \
__read_only image2d_t input, \
__write_only image2d_t output, \
__write_only image2d_t indices, \
int num_stages, \
int width \
) \
{ \
uint local_id = get_local_id(0); \
uint work_group_size = get_local_size(0); \
uint offset = 0; \
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local uint local_data[128]; \
__local uint local_indices[128]; \
\
uint left = read_imageui(input, coord.xy).x; \
coord.z += work_group_size; \
uint data = read_imageui(input, coord.zy).x; \
uint right = coord.z < width ? data : 0; \
\
local_data[local_id] = left; \
local_indices[local_id] = local_id; \
local_data[local_id + work_group_size] = right; \
local_indices[local_id + work_group_size] = local_id + work_group_size; \
\
barrier(CLK_LOCAL_MEM_FENCE); \
\
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (local_id >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
uint left_idx = local_indices[left_id]; \
uint right_idx = local_indices[right_id]; \
\
uint left_elem = local_data[left_id]; \
uint right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
\
uint4 dst; \
dst.x = local_data[local_id]; \
dst.y = local_data[local_id + work_group_size]; \
\
write_imageui(output, coord.xy, dst.xxxx); \
write_imageui(output, coord.zy, dst.yyyy); \
\
int4 index; \
index.x = ((int*)local_indices)[local_id]; \
index.y = ((int*)local_indices)[local_id + work_group_size]; \
\
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_U32(1 << 0, 0)
TOPK_U32(1 << 1, 1)
TOPK_U32(1 << 2, 2)
TOPK_U32(1 << 3, 3)
TOPK_U32(1 << 4, 4)
TOPK_U32(1 << 5, 5)
TOPK_U32(1 << 6, 6)
#define TOPK_I32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \
( \
__read_only image2d_t input, \
__write_only image2d_t output, \
__write_only image2d_t indices, \
int num_stages, \
int width \
) \
{ \
int local_id = get_local_id(0); \
int work_group_size = get_local_size(0); \
int offset = 0; \
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local int local_data[128]; \
__local int local_indices[128]; \
\
int left = read_imagei(input, coord.xy).x; \
coord.z += work_group_size; \
int data = read_imagei(input, coord.zy).x; \
int right = coord.z < width ? data : -2147483647; \
\
local_data[local_id] = left; \
local_indices[local_id] = local_id; \
local_data[local_id + work_group_size] = right; \
local_indices[local_id + work_group_size] = local_id + work_group_size; \
\
barrier(CLK_LOCAL_MEM_FENCE); \
\
for (int stage = 0; stage < num_stages + 1; ++stage) \
{ \
int signo = (local_id >> stage) & 1; \
\
for (int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
int postShift = (stage - passOfStage); \
int pairDistance = 1 << postShift; \
\
int left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \
int right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
int left_elem = local_data[left_id]; \
int right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
\
int4 dst; \
dst.x = local_data[local_id]; \
dst.y = local_data[local_id + work_group_size]; \
\
write_imagei(output, coord.xy, dst.xxxx); \
write_imagei(output, coord.zy, dst.yyyy); \
\
int4 index; \
index.x = ((int*)local_indices)[local_id]; \
index.y = ((int*)local_indices)[local_id + work_group_size]; \
\
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_I32(1 << 0, 0)
TOPK_I32(1 << 1, 1)
TOPK_I32(1 << 2, 2)
TOPK_I32(1 << 3, 3)
TOPK_I32(1 << 4, 4)
TOPK_I32(1 << 5, 5)
TOPK_I32(1 << 6, 6)
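/*
 * Illustrative note (not part of the original change): each topk_stage##N kernel
 * runs a bitonic sort over 2 * LOCAL_SIZE0 elements held in local memory (every
 * work item loads a left and a right element), so a work group of 2^N items
 * handles widths up to 2^(N+1), assuming the host sets num_stages to STAGES.
 * The (left_elem < right_elem) ^ signo compare flips direction per sub-block to
 * build the bitonic sequence, values and their original indices are swapped
 * together so they stay aligned, and out-of-range lanes are padded with a very
 * small sentinel so they sink to the end of the sorted result.
 */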

View File

@ -3,6 +3,8 @@
_viv_uniform int4 packedArgIdx;
_viv_uniform int argLenSub1;
_viv_uniform VXC_512Bits uniExtractData_2x8;
_viv_uniform VXC_512Bits uniExtract1stU8toI16_2x8;
_viv_uniform VXC_512Bits uniExtract2ndU8toI16_2x8;
#define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\
src_type, copy_type, axis_type, dst_type, inst_type) \
@ -67,6 +69,56 @@ TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_
#define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \
src_type src; \
src_type maxVal; \
VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
dst_type axis; \
dst_type packIdx; \
\
_viv_asm(COPY, axis, packedArgIdx, 16); \
_viv_asm(COPY, packIdx, packedArgIdx, 16); \
\
coord.z --; \
do \
{ \
VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
coord.z --; \
packIdx --; \
maxVal = max(maxVal, src); \
src_type condition; \
VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \
axis = condition ? packIdx : axis; \
} while (coord.z >= 0); \
\
VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char16, vxc_uchar16)
TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar16, vxc_uchar16)
#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)
#define TENSOR_ARGMAX_AXIS2_MIX(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
@ -95,23 +147,46 @@ __write_only image2d_array_t output, \
\
VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_MIX(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_MIX(U8, I16, vxc_uchar8, vxc_short8)
#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \
#define TENSOR_ARGMAX_AXIS2_MIX_OPT(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name##_opt( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \
src_type src; \
src_type maxVal; \
VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
vxc_uchar16 axis; \
vxc_uchar16 packIdx; \
\
_viv_asm(COPY, axis, packedArgIdx, 16); \
_viv_asm(COPY, packIdx, packedArgIdx, 16); \
\
coord.z --; \
do \
{ \
VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
coord.z --; \
packIdx --; \
maxVal = max(maxVal, src); \
src_type condition; \
VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \
axis = condition ? packIdx : axis; \
} while (coord.z >= 0); \
vxc_short8 dst0, dst1; \
VXC_DP2x8(dst0, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniExtract1stU8toI16_2x8); \
VXC_DP2x8(dst1, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniExtract2ndU8toI16_2x8); \
\
VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.x += 8; \
VXC_WriteImage(output, coord.xy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_MIX_OPT(I8, I16, vxc_char16, vxc_short8)
TENSOR_ARGMAX_AXIS2_MIX_OPT(U8, I16, vxc_uchar16, vxc_short8)
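The axis-2 argmax kernels above walk the channel dimension from plane argLenSub1 down to plane 0, keeping a per-lane running maximum and a packed index that is decremented each step; whenever the current plane reaches the running maximum the index is refreshed, so the scan favors the smaller plane index on ties. The _MIX_OPT variants only add a final widening of the packed U8 indices to I16 via the two new uniExtract*U8toI16_2x8 uniforms. A scalar sketch of the per-element reduction (hypothetical reference, element type fixed to U8 for brevity):

/* Hypothetical scalar reference for the downward axis-2 argmax scan above.
 * src is laid out [depth][height][width]; depth == argLenSub1 + 1. */
static void argmax_axis2_ref(const unsigned char *src, unsigned char *arg,
                             int width, int height, int depth)
{
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            int best_val = src[(depth - 1) * height * width + y * width + x];
            int best_idx = depth - 1;
            for (int z = depth - 2; z >= 0; --z)
            {
                int v = src[z * height * width + y * width + x];
                if (v >= best_val)  /* >= so the smaller index wins on ties */
                {
                    best_val = v;
                    best_idx = z;
                }
            }
            arg[y * width + x] = (unsigned char)best_idx;
        }
    }
}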

View File

@ -19,14 +19,13 @@ __kernel void Softmax2VXC
int axis
)
{
int4 coord_in = (int4)(0,0,0,0);
float fMax = 0.0;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
@ -40,7 +39,7 @@ __kernel void Softmax2VXC
vxc_char8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
@ -57,7 +56,7 @@ __kernel void Softmax2VXC
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, output, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
float fval;
_viv_asm(COPY, val_h,val, 16);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
@ -68,8 +67,4 @@ __kernel void Softmax2VXC
_viv_asm(COPY,dst,hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}

View File

@ -0,0 +1,353 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
_viv_uniform float4 matrix0;
_viv_uniform float2 matrix1;
_viv_uniform float4 matrix4;
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
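Each kernel in this file produces eight output pixels per invocation: the first pair of source coordinates is evaluated from the affine matrix directly, and the remaining pairs are obtained by adding the precomputed step matrix4, so most of the row needs only incremental adds. The per-pixel mapping itself is the usual 2x3 affine warp; a scalar sketch follows (hypothetical, assuming the host packs matrix0 = (_m0, _m1, _m2, _m3) and matrix1.xy = (_m4, _m5) in the OpenVX column-major convention):

/* Hypothetical scalar reference of the nearest-neighbor mapping done above. */
static unsigned char warp_affine_nn_ref(const unsigned char *src, int src_w, int src_h,
                                        int x, int y, const float m[6])
{
    float sx = m[0] * (float)x + m[2] * (float)y + m[4];
    float sy = m[1] * (float)x + m[3] * (float)y + m[5];
    int ix = (int)sx;  /* truncation toward zero, like convert_int4() in the kernel */
    int iy = (int)sy;
    if (ix < 0 || iy < 0 || ix >= src_w || iy >= src_h)
        return 0;      /* the kernel leaves out-of-range reads to the image border mode */
    return src[iy * src_w + ix];
}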
__kernel void custom_warp_affine_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
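The bilinear variant reads a 2x2 neighborhood per output pixel and, on the non-VX_VERSION==1 path, builds the sample from two lerps: first between the two source rows, then between the two resulting columns; the EVIS intrinsics take the full source coordinate and consume only its fractional part. A scalar equivalent of that two-step interpolation (hypothetical reference; fx/fy are the fractional parts, and the final truncation mirrors VXC_RM_TowardZero):

/* Hypothetical scalar bilinear sample, same row-then-column lerp order as above.
 * p00 = (x, y), p10 = (x+1, y), p01 = (x, y+1), p11 = (x+1, y+1). */
static unsigned char bilinear_ref(unsigned char p00, unsigned char p10,
                                  unsigned char p01, unsigned char p11,
                                  float fx, float fy)
{
    float col_x  = (float)p00 + ((float)p01 - (float)p00) * fy;  /* column x, lerped by fy   */
    float col_x1 = (float)p10 + ((float)p11 - (float)p10) * fy;  /* column x+1, lerped by fy */
    float v = col_x + (col_x1 - col_x) * fx;                     /* then lerp across by fx   */
    return (unsigned char)v;                                     /* truncate toward zero     */
}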
__kernel void custom_warp_affine_nearest_neighbor_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 dst;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_bilinear_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 src0, src1, dst;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

View File

@ -0,0 +1,395 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
_viv_uniform float4 matrix0;
_viv_uniform float4 matrix1;
_viv_uniform float4 matrix2;
_viv_uniform float4 matrix4;
__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(coord_f);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
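The perspective kernels add the homogeneous divide on top of the affine structure: z0 and z1 hold reciprocals of the denominator for four pixel pairs at a time, and matrix4 again steps the numerators across the row. A scalar sketch of the underlying mapping (hypothetical, assuming _m0.._m8 follow the OpenVX column-major 3x3 convention that matrix0/matrix1/matrix2 repack for vector math):

/* Hypothetical scalar reference of the perspective mapping computed above. */
static void warp_perspective_map_ref(float x, float y, const float m[9],
                                     float *sx, float *sy)
{
    float z = m[2] * x + m[5] * y + m[8];   /* denominator; z0/z1 store 1/z in the kernel */
    *sx = (m[0] * x + m[3] * y + m[6]) / z;
    *sy = (m[1] * x + m[4] * y + m[7]) / z;
}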
__kernel void custom_warp_perspective_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(floor(coord_f));
vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(floor(coord_f));
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(floor(coord_f));
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(floor(coord_f));
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \
VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));
__kernel void custom_warp_perspective_nearest_neighbor_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 dst;
IMAGE_LOAD_3D(dst, 0, 0, 0, 0)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 1, 1)
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(dst, 0, 0, 2, 2)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 3, 3)
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(dst, 0, 0, 4, 4)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 5, 5)
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(dst, 0, 0, 6, 6)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 7, 7)
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_perspective_bilinear_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(floor(coord_f));
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 src0, src1, dst;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(floor(coord_f));
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(floor(coord_f));
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(floor(coord_f));
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
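The 3D (image-array) kernels cannot use VXC_ReadImage directly, so IMAGE_LOAD_3D wraps img_load_3d: the image descriptor copied into input_desc supplies the base address (.s0) and the slice pitch (.s4), a per-slice base is computed once as baseAddr and parked in coord_input.w, and every subsequent load only updates the xy coordinate. A sketch of the equivalent pointer arithmetic (hypothetical; pitch names are for illustration only):

/* Hypothetical scalar view of the addressing behind baseAddr / img_load_3d:
 * pick the slice once, then index within it per pixel. */
static const unsigned char *pixel_addr_ref(const unsigned char *image_base,
                                           long slice_pitch, long row_pitch,
                                           int x, int y, int z, int bytes_per_pixel)
{
    const unsigned char *slice = image_base + (long)z * slice_pitch;  /* == baseAddr above */
    return slice + (long)y * row_pitch + (long)x * bytes_per_pixel;
}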

View File

@ -304,4 +304,4 @@ __kernel void depth2space_crd_F16toI16_blk2(
VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x)
return native_sin(x);
}
float4 eltwise_unary_cos(float4 x)
{
return native_cos(x);
}
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
float4 eltwise_unary_exp(float4 x)
@ -189,6 +194,17 @@ ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//COS
ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//LOG
ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
@ -315,6 +331,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
ELTSISE_UNARY_BF16_2D(exp)
//SIN
ELTSISE_UNARY_BF16_2D(sin)
//COS
ELTSISE_UNARY_BF16_2D(cos)
//LOG
ELTSISE_UNARY_BF16_2D(log)
//ELU

View File

@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x)
return native_sin(x);
}
float4 eltwise_unary_cos(float4 x)
{
return native_cos(x);
}
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
float4 eltwise_unary_exp(float4 x)
@ -189,6 +194,17 @@ ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//COS
ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//LOG
ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
@ -314,6 +330,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
ELTSISE_UNARY_BF16(exp)
//SIN
ELTSISE_UNARY_BF16(sin)
//COS
ELTSISE_UNARY_BF16(cos)
//LOG
ELTSISE_UNARY_BF16(log)
//ELU
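The only functional change in the two eltwise files is the new eltwise_unary_cos and its 2D/3D/BF16 instantiations; the existing ELTSISE_UNARY_* machinery converts each lane to float, applies the unary function, and converts back to the output type. A scalar view of one quantized cos lane (hypothetical; the scale and zero-point values stand in for the uniforms the macros actually use):

/* Hypothetical scalar equivalent of one U8 lane through the new cos op. */
#include <math.h>

static unsigned char eltwise_cos_u8_ref(unsigned char q,
                                        float in_scale, int in_zp,
                                        float out_scale, int out_zp)
{
    float x = ((int)q - in_zp) * in_scale;                             /* dequantize        */
    float y = cosf(x);                                                 /* eltwise_unary_cos */
    int   r = (int)(y / out_scale + (y >= 0 ? 0.5f : -0.5f)) + out_zp; /* requantize        */
    if (r < 0)   r = 0;
    if (r > 255) r = 255;
    return (unsigned char)r;
}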

View File

@ -91,8 +91,6 @@ __kernel void gather_F16toF16(
int gidz = get_global_id(2); // block_num
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;

View File

@ -0,0 +1,237 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int indices_num;
_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;
_viv_uniform int batch;
__kernel void gather_batch_I8toI8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_char16 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_U8toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_uchar16 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_I16toI16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_F16toF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_I8toI8_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_char16 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_batch_U8toU8_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_batch_I16toI16_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_batch_F16toF16_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
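Each gather_batch_* kernel above loops over the batch dimension: for batch b it reads one index from input1 at (gidy, b), selects row gidz * axis_num + index of that batch in input0, and copies block_size elements to output row gidz * indices_num + gidy of the same batch; the *_axis0 variants instead consume four indices at once and gather along x. A scalar reference of the batched addressing (hypothetical; element type abstracted to int):

/* Hypothetical scalar reference of the batched gather addressing above.
 * in  : [batch][block_num * axis_num][block_size]
 * idx : [batch][indices_num]
 * out : [batch][block_num * indices_num][block_size] */
static void gather_batch_ref(const int *in, const int *idx, int *out,
                             int block_size, int block_num, int axis_num,
                             int indices_num, int batch)
{
    for (int b = 0; b < batch; ++b)
        for (int blk = 0; blk < block_num; ++blk)
            for (int g = 0; g < indices_num; ++g)
            {
                int src_row = blk * axis_num + idx[b * indices_num + g];
                int dst_row = blk * indices_num + g;
                for (int x = 0; x < block_size; ++x)
                    out[(b * block_num * indices_num + dst_row) * block_size + x] =
                        in[(b * block_num * axis_num + src_row) * block_size + x];
            }
}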

View File

@ -0,0 +1,236 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int indices_num;
_viv_uniform int batch;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
#define GATHER_BATCH_8BITS_TO_F16(src0_type_name, read_type) \
__kernel void gather_batch_##src0_type_name##toF16( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int2 coord_idx = (int2)(gidy, 0); \
int4 coord_in = (int4)(gidx, 0, 0, 0); \
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
\
for(; coord_idx.y < batch;) \
{ \
int4 indice = read_imagei(input1, coord_idx); \
coord_idx.y++; \
coord_in.y = gidz * axis_num + indice.x; \
\
read_type src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
coord_in.z++; \
vxc_half8 src0, src1; \
vxc_short8 dst0, dst1; \
\
VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
_viv_asm(COPY, dst1, src1, 16); \
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.x += 8; \
VXC_WriteImage2DArray(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.z++; \
coord.x = gidx; \
} \
}
GATHER_BATCH_8BITS_TO_F16(U8, vxc_uchar16)
GATHER_BATCH_8BITS_TO_F16(I8, vxc_char16)
#define GATHER_BATCH_F16_TO_QINT(src1_type_name, write_type) \
__kernel void gather_batch_F16to##src1_type_name( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int2 coord_idx = (int2)(gidy, 0); \
int4 coord_in = (int4)(gidx, 0, 0, 0); \
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
for(; coord_idx.y < batch;) \
{ \
int4 indice = read_imagei(input1, coord_idx); \
coord_idx.y++; \
coord_in.y = gidz * axis_num + indice.x; \
\
vxc_short8 src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord_in.z++; \
\
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.z++; \
} \
}
GATHER_BATCH_F16_TO_QINT(U8, vxc_uchar16)
GATHER_BATCH_F16_TO_QINT(I8, vxc_char16)
GATHER_BATCH_F16_TO_QINT(I16, vxc_short8)
__kernel void gather_batch_I16toF16(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
vxc_half8 src0;
vxc_short8 dst0;
vxc_ushort8 ms0;
_viv_asm(COPY, ms0, multAndoutZP0, 16);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniU8MulAndPostShift_0_Lo_2x8);
_viv_asm(COPY, dst0, src0, 16);
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
#define GATHER_BATCH_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \
__kernel void gather_batch_##src0_type_name##toF16_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 indices = read_imagei(input1, coord.xz); \
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \
\
read_type src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.y; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.z; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.w; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_BATCH_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)
GATHER_BATCH_8BITS_TO_F16_AXIS0(I8, vxc_char16)
#define GATHER_BATCH_F16_TO_QINT_AXIS0(src1_type_name, write_type) \
__kernel void gather_batch_F16to##src1_type_name##_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 indices = read_imagei(input1, coord.xz); \
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.y; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.z; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.w; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
\
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_BATCH_F16_TO_QINT_AXIS0(U8, vxc_uchar16)
GATHER_BATCH_F16_TO_QINT_AXIS0(I8, vxc_char16)
GATHER_BATCH_F16_TO_QINT_AXIS0(I16, vxc_short8)
__kernel void gather_batch_I16toF16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
vxc_half8 src0;
vxc_short8 dst0;
vxc_ushort8 ms0;
_viv_asm(COPY, ms0, multAndoutZP0, 16);
VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniU8MulAndPostShift_0_Lo_2x8);
_viv_asm(COPY, dst0, src0, 16);
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

View File

@ -1,5 +1,7 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvertInt16toInt8_2x8;
#define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\
lgc_op, lgc_op2, read_fun, write_fun) \
input_type vA;\
@ -59,7 +61,7 @@ out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \
VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \
out_copy_type data; \
_viv_asm(COPY, data, tmpOut, 16); \
write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
#define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\
@ -86,6 +88,47 @@ copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \
VXC_ReadImage, VXC_WriteImage) \
}
#define TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type,\
out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \
input_type vA;\
copy_type src0;\
input_type vB;\
copy_type src1;\
read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\
_viv_asm(COPY, src0, vA, 16); \
read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\
_viv_asm(COPY, src1, vB, 16); \
output_type dst; \
dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \
vxc_char8 data; \
VXC_DP2x8(data,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 1),uniConvertInt16toInt8_2x8); \
data &= 1; \
write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
#define TENSORLOGICAL_BFP16(name0, src_type_name, dst_type_name, input_type,\
copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \
__kernel void logical_##name0##_##src_type_name##to##dst_type_name( \
__read_only image2d_array_t in0, \
__read_only image2d_array_t in1, \
__write_only image2d_array_t output) \
{\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\
VXC_ReadImage2DArray, VXC_WriteImage2DArray) \
}
#define TENSORLOGICAL_BFP16_2D(name0, src_type_name, dst_type_name, input_type,\
copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \
__kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \
__read_only image2d_array_t in0, \
__read_only image2d_array_t in1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\
VXC_ReadImage, VXC_WriteImage) \
}
// name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2
TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )
//TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )
@ -100,6 +143,10 @@ TENSORLOGICAL(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vx
//TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
//TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
TENSORLOGICAL_BFP16(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )
TENSORLOGICAL_BFP16(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )
TENSORLOGICAL_BFP16(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )
//TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )
//TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )
@@ -112,3 +159,7 @@ TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8,
//TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!)
//TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
//TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
TENSORLOGICAL_BFP16_2D(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )
TENSORLOGICAL_BFP16_2D(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )
TENSORLOGICAL_BFP16_2D(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)

@@ -0,0 +1,272 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int ac2zero;
_viv_uniform int bc2zero;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
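// BF16 GEMM: each work-item accumulates a 4x4 tile of the output in float32.
// uniConvBF16toF32_Part0/Part1_2x8 widen the packed bfloat16 shorts to float,
// and uniExtractOddData_2x8 packs the float32 results back to BF16.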
__kernel void gemm_BF16BF16toBF16(image2d_array_t inputA,
image2d_array_t inputB, image2d_array_t output,
int transposeA, int transposeB,
int adjointA, int adjointB, uint M, uint K, uint N)
{
uint gidy = get_global_id(1);
int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_ushort8 valC0, valC1, src0, src1;
vxc_ushort8 srcA0, srcB0, srcA1, srcB1, outC;
vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);
int8 inputA_desc, inputB_desc, output_desc;
_viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
_viv_asm(MOV, coord_a.w, baseAddr_a);
_viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr_b);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)
{
vxc_float4 tempA0, tempA1, tempA2, tempA3;
vxc_float4 tempB0, tempB1, tempB2, tempB3;
VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
coord_a.x += 4;
coord_b.y += 4;
VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA0, src0, 16);
VXC_DP2x8(src1, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempA1, src1, 16);
VXC_DP2x8(src0, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA2, src0, 16);
VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempA3, src1, 16);
VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB0, src0, 16);
VXC_DP2x8(src1, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempB1, src1, 16);
VXC_DP2x8(src0, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB2, src0, 16);
VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempB3, src1, 16);
sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3);
sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3);
sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3);
sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);
}
coord_b.y = gidy;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr);
_viv_asm(COPY, valC0, sum0, 16);
_viv_asm(COPY, valC1, sum1, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
_viv_asm(COPY, valC0, sum2, 16);
_viv_asm(COPY, valC1, sum3, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
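// Variant with A transposed: one row of A and one row of B are loaded per
// iteration, and the 4x4 output tile is built from rank-1 updates.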
__kernel void gemm_transa_BF16BF16toBF16(
image2d_array_t inputA,
image2d_array_t inputB,
image2d_array_t output,
int transposeA,
int transposeB,
int adjointA,
int adjointB,
uint M, uint K, uint N)
{
uint gidy = get_global_id(1);
vxc_ushort8 valC0, valC1;
vxc_ushort8 srcA, srcB, outC, src0, src1;
int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_float4 sum0 = (vxc_float4)(0);
vxc_float4 sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0);
vxc_float4 sum3 = (vxc_float4)(0);
int8 inputA_desc, inputB_desc, output_desc;
_viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
_viv_asm(MOV, coord_a.w, baseAddr_a);
_viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr_b);
vxc_float4 tempA0;
vxc_float4 tempB0;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;)
{
VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_a.y++;
coord_b.y++;
VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA0, src0, 16);
VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB0, src1, 16);
sum0 = (sum0 + tempA0.x * tempB0);
sum1 = (sum1 + tempA0.y * tempB0);
sum2 = (sum2 + tempA0.z * tempB0);
sum3 = (sum3 + tempA0.w * tempB0);
}
coord_b.y = gidy;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr);
_viv_asm(COPY, valC0, sum0, 16);
_viv_asm(COPY, valC1, sum1, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
_viv_asm(COPY, valC0, sum2, 16);
_viv_asm(COPY, valC1, sum3, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
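// Variant with B transposed: rows of B are read directly, so each output
// element is the dot product of an A row with a B row.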
__kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,
image2d_array_t inputB,
image2d_array_t output,
int transposeA,
int transposeB,
int adjointA,
int adjointB,
uint M, uint K, uint N)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_float4 sum0 = (vxc_float4)(0);
vxc_float4 sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0);
vxc_float4 sum3 = (vxc_float4)(0);
int8 inputA_desc, inputB_desc, output_desc;
_viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
_viv_asm(MOV, coord_a.w, baseAddr_a);
_viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr_b);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_ushort8 src0, src1;
for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)
{
vxc_ushort8 srcA0,srcA1,srcA2,srcA3;
vxc_ushort8 srcB0,srcB1,srcB2,srcB3;
vxc_float4 tempA0, tempA1, tempA2, tempA3;
vxc_float4 tempB0, tempB1, tempB2, tempB3;
VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_a.x += 4;
coord_b.x += 4;
VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA0, src0, 16);
VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA1, src1, 16);
VXC_DP2x8(src0, srcA2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA2, src0, 16);
VXC_DP2x8(src1, srcA3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA3, src1, 16);
VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB0, src0, 16);
VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB1, src1, 16);
VXC_DP2x8(src0, srcB2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB2, src0, 16);
VXC_DP2x8(src1, srcB3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB3, src1, 16);
sum0 += (float4)(dot(tempA0, tempB0), dot(tempA0, tempB1), dot(tempA0, tempB2), dot(tempA0, tempB3));
sum1 += (float4)(dot(tempA1, tempB0), dot(tempA1, tempB1), dot(tempA1, tempB2), dot(tempA1, tempB3));
sum2 += (float4)(dot(tempA2, tempB0), dot(tempA2, tempB1), dot(tempA2, tempB2), dot(tempA2, tempB3));
sum3 += (float4)(dot(tempA3, tempB0), dot(tempA3, tempB1), dot(tempA3, tempB2), dot(tempA3, tempB3));
}
vxc_ushort8 valC0, valC1, valDst;
_viv_asm(COPY, valC0, sum0, 16);
_viv_asm(COPY, valC1, sum1, 16);
VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
_viv_asm(COPY, valC0, sum2, 16);
_viv_asm(COPY, valC1, sum3, 16);
VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

@@ -11,6 +11,9 @@ _viv_uniform int ac2zero;
_viv_uniform int bc2zero;
_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#if (VX_VERSION==2)
__kernel void gemm_F16F16toF16(image2d_array_t inputA,
@@ -192,14 +195,9 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,
}
#endif
__kernel void gemm_F32F32toF32(image2d_array_t inputA,
image2d_array_t inputB,
image2d_array_t output,
int transposeA,
int transposeB,
int adjointA,
int adjointB,
uint M, uint K, uint N)
__kernel void gemm_F32F32toF32(
image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output,
int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)
{
uint gidx = get_global_id(0);
uint gidy = get_global_id(1);
@@ -207,10 +205,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,
int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(gidx, 0, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_float4 sum0 = (vxc_float4)(0);
vxc_float4 sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0);
vxc_float4 sum3 = (vxc_float4)(0);
vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);
vxc_int4 tmpOut0, tmpOut1;
vxc_uchar16 outC;
@@ -224,7 +220,6 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,
coord_a.x = i;
coord_a.y = gidy;
coord_b.x = gidx;
coord_b.y = i;
@@ -257,4 +252,4 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,
write_imagef(output, coord_b, sum2);
coord_b.y++;
write_imagef(output, coord_b, sum3);
}
}

@@ -222,6 +222,62 @@ __kernel void maximum_U8U8toU8_2D
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
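// U8 + U8 -> I16: both inputs are rescaled to the output quantization with
// uniU8MulAndPostShift{0,1}_Lo_2x8 before the element-wise max is taken.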
__kernel void maximum_U8U8toI16
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void maximum_U8U8toI16_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_uchar16 src0, src1;
VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;
_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;
__kernel void maximum_I16I16toI16

@@ -170,4 +170,64 @@ __kernel void maximum_F16F16toI16_2D
tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [32:63] output zp
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [32:63] output zp
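// I16 + I16 -> U8: each input is requantized with its own multiplier and
// output zero point, then the element-wise max is written as uint8.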
__kernel void maximum_I16I16toU8
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void maximum_I16I16toU8_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
vxc_short8 src0, src1;
VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

@@ -224,6 +224,62 @@ __kernel void minimum_U8U8toU8_2D
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
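// U8 + U8 -> I16: both inputs are rescaled to the output quantization with
// uniU8MulAndPostShift{0,1}_Lo_2x8 before the element-wise min is taken.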
__kernel void minimum_U8U8toI16
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void minimum_U8U8toI16_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_uchar16 src0, src1;
VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;
_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;
__kernel void minimum_I16I16toI16

@@ -173,5 +173,65 @@ __kernel void minimum_F16F16toI16_2D
tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [32:63] output zp
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [32:63] output zp
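// I16 + I16 -> U8: each input is requantized with its own multiplier and
// output zero point, then the element-wise min is written as uint8.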
__kernel void minimum_I16I16toU8
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void minimum_I16I16toU8_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
vxc_short8 src0, src1;
VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

@@ -17,6 +17,9 @@ _viv_uniform float e2InScale;
_viv_uniform float rowSumScale;
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \
__kernel void moments_axis0_##src0_type_name##toF16( \
@@ -262,6 +265,88 @@ __kernel void moments_axis0_I16toF16_2D(
VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
_viv_asm(COPY, dst, tmpVal, 16);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
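// BF16 moments along axis 0: sum and sum-of-squares are accumulated in
// float32; mean = sum * dimRatio, variance = E[x^2] - mean^2, and the pair is
// packed back to BF16 with uniExtractOddData_2x8.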
__kernel void moments_axis0_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidy = get_global_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(0, gidy, gidz, 0);
vxc_ushort8 src0, src1;
vxc_ushort8 val;
vxc_float4 mean_vari0 = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 vec0, vec1;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, vec1, src1, 16);
mean_vari0.x += dot(vec0, one) + dot(vec1, one);
mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);
}
mean_vari0 *= dimRatio;
mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;
int2 coord_out = (int2)(gidy, gidz);
vxc_short8 dst;
_viv_asm(COPY, src0, mean_vari0, 16);
VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void moments_axis0_BF16toBF16_2D(
image2d_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidy = get_global_id(0);
int2 coord = (int2)(0, gidy);
vxc_ushort8 src0, src1;
vxc_ushort8 val;
vxc_float4 mean_vari0 = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 vec0, vec1;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, vec1, src1, 16);
mean_vari0.x += dot(vec0, one) + dot(vec1, one);
mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);
}
mean_vari0 *= dimRatio;
mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;
int2 coord_out = (int2)(gidy, 0);
vxc_short8 dst;
_viv_asm(COPY, src0, mean_vari0, 16);
VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

@@ -18,6 +18,9 @@ _viv_uniform float e2InScale;
_viv_uniform float rowSumScale;
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \
@@ -236,4 +239,79 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I1
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
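// BF16 moments over axes 0/1/2: each work-item accumulates partial sums, the
// 16-item work-group reduces them through local memory, and local id 0 writes
// the final mean/variance pair.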
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis,
int axis_num)
{
int gidx = get_global_id(0) << 3;
int lidx = get_local_id(0);
int4 coord = (int4)(gidx, 0, 0, 0);
vxc_float4 sumsqr;
__local float lcl_sum[16];
__local float lcl_sqr[16];
float tmpSum = 0;
float tmpSqr = 0;
vxc_ushort8 src0, src1;
vxc_ushort8 val;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);
for(coord.z = 0; coord.z < channel; coord.z++)
{
for(coord.x = gidx; coord.x < width; coord.x += 128)
{
for(coord.y = 0; coord.y < height;)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
float4 vec0, vec1;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, vec1, src1, 16);
tmpSum += dot(vec0, one) + dot(vec1, one);
tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);
}
}
}
lcl_sum[lidx] = tmpSum;
lcl_sqr[lidx] = tmpSqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(0, 0);
if(lidx == 0)
{
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float sum = (float)(0);
float sqr = (float)(0);
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 mean_vari;
mean_vari.x = sum * dimRatio;
mean_vari.y = sqr * dimRatio;
mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;
vxc_short8 dst;
_viv_asm(COPY, src0, mean_vari, 16);
VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}

@@ -10,6 +10,8 @@ _viv_uniform float e2InScale;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \
__kernel void moments_axis1_##src0_type_name##toF16( \
@@ -197,3 +199,85 @@ __kernel void moments_axis1_F16toF16_2D(
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
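// BF16 moments along axis 1: four columns are accumulated over the rows
// (height), then mean and variance are packed back to BF16.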
__kernel void moments_axis1_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidx = get_global_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_ushort8 src0;
vxc_ushort8 val;
vxc_float4 sum = (vxc_float4)(0);
vxc_float4 sqr = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
float4 vec0;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
sum += vec0;
sqr += (vec0 * vec0);
}
vxc_float4 mean = sum * dimRatio;
vxc_float4 vari = sqr * dimRatio;
vari = vari - mean * mean;
int2 coord_out = (int2)(gidx, gidz);
vxc_short8 tmpdst0, tmpdst1, dst;
_viv_asm(COPY, tmpdst0, mean, 16);
_viv_asm(COPY, tmpdst1, vari, 16);
VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void moments_axis1_BF16toBF16_2D(
image2d_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidx = get_global_id(0);
int2 coord = (int2)(gidx, 0);
vxc_ushort8 src0;
vxc_ushort8 val;
vxc_float4 sum = (vxc_float4)(0);
vxc_float4 sqr = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
float4 vec0;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
sum += vec0;
sqr += (vec0 * vec0);
}
vxc_float4 mean = sum * dimRatio;
vxc_float4 vari = sqr * dimRatio;
vari = vari - mean * mean;
int2 coord_out = (int2)(gidx, 0);
vxc_short8 tmpdst0, tmpdst1, dst;
_viv_asm(COPY, tmpdst0, mean, 16);
_viv_asm(COPY, tmpdst1, vari, 16);
VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

@@ -9,6 +9,8 @@ _viv_uniform float e2InScale;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \
__kernel void moments_axis2_##src0_type_name##toF16( \
@@ -95,6 +97,50 @@ __kernel void moments_axis2_F16toF16(
_viv_asm(CONV, tmpVari, vari);
VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
_viv_asm(COPY, dst, tmpVal, 16);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
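// BF16 moments along axis 2: four x positions are accumulated over the
// channel dimension, then mean and variance are packed back to BF16.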
__kernel void moments_axis2_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis,
int axis_num)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int4 coord = (int4)(gidx, gidy, 0, 0);
vxc_ushort8 src0;
vxc_ushort8 val;
vxc_float4 sum = (vxc_float4)(0);
vxc_float4 sqr = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord.z = 0; coord.z < channel; coord.z++)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
float4 vec0;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
sum += vec0;
sqr += (vec0 * vec0);
}
vxc_float4 mean = sum * dimRatio;
vxc_float4 vari = sqr * dimRatio;
vari = vari - mean * mean;
int2 coord_out = (int2)(gidx, gidy);
vxc_short8 tmpdst0, tmpdst1, dst;
_viv_asm(COPY, tmpdst0, mean, 16);
_viv_asm(COPY, tmpdst1, vari, 16);
VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
