Update internal ovxlib to rel/1.2.14 (#699)

Type: New Feature

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue 2024-07-08 09:29:24 +08:00 committed by GitHub
parent 8894360c74
commit c8b7c410bf
94 changed files with 14958 additions and 320 deletions

View File

@ -1 +1 @@
1.2.6
1.2.14

View File

@ -199,3 +199,7 @@ DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)
DEF_OP(RMSNORM)
DEF_OP(SHAPE)
DEF_OP(BITCAST)
DEF_OP(GROUPED_CONV3D)
DEF_OP(COL2IM)
DEF_OP(L1_LAYER_NORM)

View File

@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_BITCAST_H
#define _VSI_NN_OP_BITCAST_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_bitcast_param
{
struct _bitcast_local_data_t* local;
} vsi_nn_bitcast_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_COL2IM_H
#define _VSI_NN_OP_COL2IM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_col2im_param
{
const int32_t* image_shape;
const int32_t* block_shape;
int32_t strides[3];
int32_t pads[6];
int32_t dilations[3];
int32_t dim_num;
} vsi_nn_col2im_param;
#ifdef __cplusplus
}
#endif
#endif
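
For orientation, here is a minimal, hedged sketch of how a caller might populate the new vsi_nn_col2im_param before wiring up a COL2IM node. The field names come from the struct above; the concrete geometry values, the reading of dim_num as the number of spatial dimensions, and the pad ordering are illustrative assumptions, not something this header states.

#include <stdint.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "ops/vsi_nn_op_col2im.h"

/* Illustrative geometry only; real values depend on the model. */
static const int32_t image_shape[2] = { 8, 8 };  /* spatial shape of the reconstructed image */
static const int32_t block_shape[2] = { 3, 3 };  /* sliding block (kernel) shape */

static void fill_col2im_param( vsi_nn_col2im_param * p )
{
    memset( p, 0, sizeof( *p ) );
    p->image_shape  = image_shape;
    p->block_shape  = block_shape;
    p->strides[0]   = 1;  p->strides[1]   = 1;
    p->dilations[0] = 1;  p->dilations[1] = 1;
    /* pads[] stays zero; the front/end ordering per spatial dim is assumed */
    p->dim_num = 2;  /* assumed: number of spatial dimensions covered by block_shape */
}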

View File

@ -0,0 +1,55 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GROUPED_CONV3D_H
#define _VSI_NN_OP_GROUPED_CONV3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_grouped_conv3d_param
{
void* local;
uint32_t ksize[3];
uint32_t stride[3];
/* Pad left, right, top, bottom, front, rear */
uint32_t pad[6];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
uint32_t weights;
uint32_t group;
uint32_t dilation[3];
int32_t multiplier;
vsi_nn_pad_mode_e pad_mode;
} vsi_nn_grouped_conv3d_param;
#ifdef __cplusplus
}
#endif
#endif
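
Similarly, a hedged sketch of filling the new vsi_nn_grouped_conv3d_param, by analogy with the existing 2-D grouped convolution parameters. Only the pad ordering and the AUTO default are stated by the struct comments; the reading of weights as the output-channel count and of multiplier are assumptions, and all numeric values are illustrative.

#include <stdint.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "ops/vsi_nn_op_grouped_conv3d.h"

static void fill_grouped_conv3d_param( vsi_nn_grouped_conv3d_param * p )
{
    memset( p, 0, sizeof( *p ) );
    p->ksize[0]  = 3;  p->ksize[1]  = 3;  p->ksize[2]  = 3;
    p->stride[0] = 1;  p->stride[1] = 1;  p->stride[2] = 1;
    /* pad order per the struct comment: left, right, top, bottom, front, rear */
    p->pad_type = VSI_NN_PAD_AUTO;  /* "default value shall be AUTO" per the comment */
    p->weights  = 16;               /* assumed: number of output channels */
    p->group    = 2;                /* number of convolution groups */
    p->dilation[0] = 1;  p->dilation[1] = 1;  p->dilation[2] = 1;
    p->multiplier  = 0;             /* assumed: non-depthwise case */
}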

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_L1_LAYER_NORM_H
#define _VSI_NN_OP_L1_LAYER_NORM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_l1_layer_norm_param
{
struct _l1_layer_norm_local_data_t * local;
float eps;
int32_t axis;
} vsi_nn_l1_layer_norm_param;
#ifdef __cplusplus
}
#endif
#endif
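
The L1 layer-norm parameter block only carries an epsilon and the normalization axis; a trivial, hedged example follows (the typical eps value and the axis convention are assumptions):

#include <string.h>
#include "vsi_nn_types.h"
#include "ops/vsi_nn_op_l1_layer_norm.h"

static void fill_l1_layer_norm_param( vsi_nn_l1_layer_norm_param * p )
{
    memset( p, 0, sizeof( *p ) );
    p->eps  = 1e-5f;  /* small constant added to the L1 denominator (typical value, assumed) */
    p->axis = 0;      /* axis to normalize over (assumed to follow the LAYER_NORM convention) */
}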

View File

@ -349,7 +349,7 @@ vsi_bool vsi_nn_IsEVISFeatureAvaiable
vsi_nn_context_t context
);
int32_t vsi_nn_compareVersion
OVXLIB_API int32_t vsi_nn_compareVersion
(
vsi_nn_graph_t * graph,
uint32_t version_major,

File diff suppressed because it is too large

View File

@ -26,6 +26,7 @@
#define _VSI_NN_CONTEXT_H
#include "vsi_nn_platform.h"
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
@ -75,12 +76,19 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_shader;
int32_t enable_opcheck;
int32_t enable_concat_optimize;
int32_t enable_asymi8_to_u8;
/* 0: disable int8 to uint8 conversion
* 1: convert asymmetric int8 to asymmetric uint8
* 2: convert both asymmetric and symmetric int8 to asymmetric uint8
*/
int32_t enable_i8_to_u8;
int32_t enable_dataconvert_optimize;
int32_t enable_stream_processor;
int32_t enable_rgb88_planar_nhwc;
int32_t enable_slice_optimize;
int32_t enable_batch_opt;
int32_t enable_save_file_type;
int32_t enable_use_image_process;
int32_t enable_use_from_handle;
} vsi_nn_runtime_option_t;
/**
@ -101,6 +109,10 @@ typedef struct _vsi_nn_context_t
OVXLIB_API vsi_nn_context_t vsi_nn_CreateContext
( void );
OVXLIB_API vsi_status vsi_nn_initOptions
(
vsi_nn_runtime_option_t *options
);
/**
* Release context
* Release ovxlib NN runtime resource and reset context handle to NULL.
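
For context, the option block above replaces enable_asymi8_to_u8 with the three-state enable_i8_to_u8 and adds several new switches, and vsi_nn_initOptions is now exported. Below is a hedged sketch of using them together; the assumption that vsi_nn_initOptions fills the struct with default values, and the question of how the struct is then attached to a context, are not answered by this hunk.

#include "vsi_nn_context.h"

static vsi_status setup_runtime_options( void )
{
    vsi_nn_runtime_option_t options;
    vsi_status status = vsi_nn_initOptions( &options );  /* assumed: populates default values */
    if ( status != VSI_SUCCESS )
    {
        return status;
    }
    /* 2 = convert both asymmetric and symmetric int8 to asymmetric uint8 */
    options.enable_i8_to_u8 = 2;
    options.enable_dataconvert_optimize = 1;
    return VSI_SUCCESS;
}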

View File

@ -53,5 +53,9 @@
#if defined(VX_13_NN_COMPATIBLITY)
#define VSI_MAP_TENSOR_PATCH_SUPPORT
#endif
#if defined (VX_QUANT_PER_GROUP_SUPPORT)
#define VSI_PER_GROUP_QUANTIZATION_SUPPORT
#endif
#define VSI_GRAPH_RUNTIME_ENV_SUPPORT
#endif

View File

@ -814,11 +814,77 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop
vsi_nn_tensor_t *max_iteration_tensor
);
OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
/**
* Set runtime variable
* Set runtime variable for ovxlib and driver.
*
* @param[in] graph Graph handle
* @param[in] key Ovxlib or driver environment variable name
* Ovxlib supported keys:
* VSI_NN_ENABLE_I8TOU8
* VSI_NN_ENABLE_OPCHECK
* VSI_SAVE_FILE_TYPE
* VSI_USE_IMAGE_PROCESS
* VSI_NN_ENABLE_CONCAT_OPTIMIZE
* VSI_NN_ENABLE_DATACONVERT_OPTIMIZE
* VSI_VX_ENABLE_STREAM_PROCESSOR
* VSI_NN_FORCE_RGB888_OUT_NHWC
* VSI_NN_ENABLE_SLICE_OPTIMIZE
* VSI_VX_ENABLE_BATCH_OPT
* VSI_USE_FROM_HANDLE
* Driver keys:
* VIV_VX_ENABLE_GRAPH_TRANSFORM
* VIV_VX_ENABLE_SHADER
* Any key other than the ovxlib keys listed above is treated as a driver environment variable.
* @return VSI_SUCCESS on success, or appropriate error code otherwise
*/
OVXLIB_API vsi_status vsi_nn_SetRunTimeVariable
(
vsi_nn_graph_t* graph,
const char* ctrl_str,
size_t size
const char* key,
const char* value
);
/**
* Get runtime variable
* Get runtime variable of ovxlib.
*
* @param[in] graph Graph handle
* @param[in] key Environment variable name
* Supported keys:
* VSI_NN_ENABLE_I8TOU8
* VSI_NN_ENABLE_OPCHECK
* VSI_SAVE_FILE_TYPE
* VSI_USE_IMAGE_PROCESS
* VSI_NN_ENABLE_CONCAT_OPTIMIZE
* VSI_NN_ENABLE_DATACONVERT_OPTIMIZE
* VSI_VX_ENABLE_STREAM_PROCESSOR
* VSI_NN_FORCE_RGB888_OUT_NHWC
* VSI_NN_ENABLE_SLICE_OPTIMIZE
* VSI_VX_ENABLE_BATCH_OPT
* VSI_USE_FROM_HANDLE
* VIV_VX_ENABLE_GRAPH_TRANSFORM
* VIV_VX_ENABLE_SHADER
* Only the keys listed above are supported.
* @return The variable's value on success, or NULL otherwise. Note: on success,
* the caller must release the returned memory after use.
*/
OVXLIB_API char* vsi_nn_GetRunTimeVariable
(
const vsi_nn_graph_t* graph,
const char* key
);
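
A hedged usage sketch for the Set/Get pair above, using one of the listed ovxlib keys. The accepted value strings and the allocator pairing for the returned buffer are assumptions; the header only says the caller must release the memory.

#include <stdio.h>
#include <stdlib.h>
#include "vsi_nn_graph.h"

static void tune_graph( vsi_nn_graph_t * graph )
{
    /* Enable int8 -> uint8 conversion for this graph ("1" assumed to be a valid value). */
    if ( vsi_nn_SetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8", "1" ) != VSI_SUCCESS )
    {
        return;
    }
    /* Read it back; the caller owns the returned string. */
    char * value = vsi_nn_GetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8" );
    if ( value )
    {
        printf( "VSI_NN_ENABLE_I8TOU8 = %s\n", value );
        free( value );  /* assumed allocator pairing */
    }
}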
int32_t vsi_nn_GetVariable(const char* variableKey);
OVXLIB_API char* vsi_nn_GenerateGraphJson
(
vsi_nn_graph_t* graph
);
OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson
(
char* json
);
/**

View File

@ -212,6 +212,10 @@
#include "ops/vsi_nn_op_crop_and_resize.h"
#include "ops/vsi_nn_op_rmsnorm.h"
#include "ops/vsi_nn_op_shape.h"
#include "ops/vsi_nn_op_bitcast.h"
#include "ops/vsi_nn_op_grouped_conv3d.h"
#include "ops/vsi_nn_op_col2im.h"
#include "ops/vsi_nn_op_l1_layer_norm.h"
/* custom node header defines */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -412,6 +416,10 @@ typedef union _vsi_nn_nn_param
vsi_nn_crop_and_resize_param crop_and_resize;
vsi_nn_rmsnorm_param rmsnorm;
vsi_nn_shape_param shape;
vsi_nn_bitcast_param bitcast;
vsi_nn_grouped_conv3d_param grouped_conv3d;
vsi_nn_col2im_param col2im;
vsi_nn_l1_layer_norm_param l1_layer_norm;
void* client_param;
/* custom node data struct define */

View File

@ -86,6 +86,8 @@ typedef enum
VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
/** perchannel float8 */
VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
/** GPTQ */
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
/** undefined type */
VSI_NN_QNT_TYPE_NA = 0xff,
} vsi_nn_qnt_type_e;
@ -126,6 +128,16 @@ typedef struct vsi_nn_dtype
const int32_t * zero_points;
int32_t zero_points_dim;
};
#endif
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
/** Meaningful in GPTQ_SYMMETRIC */
struct {
const float* group_scales;
int32_t group_channel_dim;
int32_t group_size;
const int32_t* group_zero_points;
int32_t group_count;
};
#endif
};
};
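
A hedged sketch of describing a per-group (GPTQ-style) symmetrically quantized tensor with the new fields. The reading of group_size as elements per scale group, group_channel_dim as the dimension being split, and group_count as the number of groups follows the field names only; this hunk does not define their semantics.

#include "vsi_nn_tensor.h"

static void fill_group_quant_dtype( vsi_nn_dtype_t * dtype,
                                    const float * scales,
                                    const int32_t * zero_points )
{
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
    dtype->qnt_type          = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC;
    dtype->group_scales      = scales;       /* one scale per group (assumed layout) */
    dtype->group_zero_points = zero_points;
    dtype->group_channel_dim = 1;            /* dimension that is split into groups (assumed) */
    dtype->group_size        = 64;           /* elements per group (assumed meaning) */
    dtype->group_count       = 2;            /* groups along that dimension (assumed meaning) */
#else
    (void)dtype; (void)scales; (void)zero_points;
#endif
}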

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 5
#define VSI_NN_VERSION_PATCH 14
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)

View File

@ -35,6 +35,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_ARGMAX_VX_SUPPORT)
__BEGIN_DECLS
@ -289,3 +291,5 @@ OnError:
__END_DECLS
REGISTER_BACKEND_CL( argmax, _setup )
#endif

View File

@ -0,0 +1,432 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
#define _COL2IM_KERNEL_SOURCE_NAME "col2im"
// Add kernel hashtable here
#define COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 | (_image_2d)))
#define COL2IM_KERNELS( IN_DTYPE, OUT_DTYPE ) \
{ COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 0), \
CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE), \
_COL2IM_KERNEL_SOURCE_NAME }
#define COL2IM_KERNELS_2D( IN_DTYPE, OUT_DTYPE ) \
{ COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 1), \
CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
_COL2IM_KERNEL_SOURCE_NAME }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _col2im_kernel_map[] =
{
// Register kernel here
COL2IM_KERNELS( F32, F32 ),
COL2IM_KERNELS( F32, U32 ),
COL2IM_KERNELS( F32, I32 ),
COL2IM_KERNELS( U32, U32 ),
COL2IM_KERNELS( U32, F32 ),
COL2IM_KERNELS( U32, I32 ),
COL2IM_KERNELS( I32, I32 ),
COL2IM_KERNELS( I32, U32 ),
COL2IM_KERNELS( I32, F32 ),
COL2IM_KERNELS_2D( F32, F32 ),
COL2IM_KERNELS_2D( F32, U32 ),
COL2IM_KERNELS_2D( F32, I32 ),
COL2IM_KERNELS_2D( U32, U32 ),
COL2IM_KERNELS_2D( U32, F32 ),
COL2IM_KERNELS_2D( U32, I32 ),
COL2IM_KERNELS_2D( I32, I32 ),
COL2IM_KERNELS_2D( I32, U32 ),
COL2IM_KERNELS_2D( I32, F32 ),
};
/*
* Kernel params
*/
static vx_param_description_t _col2im_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _COL2IM_PARAM_NUM _cnt_of_array( _col2im_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_col2im_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels are processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0} // globalWorkSize: image size in threads
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * in_shape = NULL;
int32_t stride_w = 1, stride_h = 1;
int32_t dilation_w = 1, dilation_h = 1, dilation_d = 1;
int32_t pad_w_front = 0, pad_w_end = 0, pad_h_front = 0, pad_h_end = 0, pad_d_front = 0, pad_d_end = 0;
int32_t kernel_w = 1, kernel_h = 1, kernel_d = 1;
int32_t move_time_x = 0;
int32_t move_time_y = 0;
int32_t width_pad = 0;
int32_t height_pad = 0;
int32_t depth_pad = 0;
int32_t kernel_x_new = 1;
int32_t kernel_y_new = 1;
int32_t kernel_z_new = 1;
int32_t batch = 1;
int32_t width = 1;
int32_t height = 1;
int32_t depth = 1;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_w);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_h);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &dilation_w);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &dilation_h);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation_d);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_w_front);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_w_end);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_h_front);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &pad_h_end);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_d_front);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_d_end);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &kernel_w);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &kernel_h);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[16], &kernel_d);
CHECK_STATUS_FAIL_GOTO(status, final );
batch = (int32_t)(attr[0]->shape->data[2]);
width = (int32_t)(attr[1]->shape->data[0]);
height = (int32_t)(attr[1]->shape->data[1]);
depth = (int32_t)(attr[1]->shape->data[2]) / batch;
width_pad = width + pad_w_front + pad_w_end;
height_pad = height + pad_h_front + pad_h_end;
depth_pad = depth + pad_d_front + pad_d_end;
move_time_x = (width_pad - ((kernel_w - 1) * dilation_w + 1) + stride_w) / stride_w;
move_time_y = (height_pad - ((kernel_h - 1) * dilation_h + 1) + stride_h) / stride_h;
kernel_x_new = (kernel_w - 1) * dilation_w + 1;
kernel_y_new = (kernel_h - 1) * dilation_h + 1;
kernel_z_new = (kernel_d - 1) * dilation_d + 1;
status = vsi_nn_kernel_gpu_add_param( node, "width_pad", &width_pad );
status |= vsi_nn_kernel_gpu_add_param( node, "height_pad", &height_pad );
status |= vsi_nn_kernel_gpu_add_param( node, "depth_pad", &depth_pad );
status |= vsi_nn_kernel_gpu_add_param( node, "move_time_x", &move_time_x );
status |= vsi_nn_kernel_gpu_add_param( node, "move_time_y", &move_time_y );
status |= vsi_nn_kernel_gpu_add_param( node, "kernel_x_new", &kernel_x_new );
status |= vsi_nn_kernel_gpu_add_param( node, "kernel_y_new", &kernel_y_new );
status |= vsi_nn_kernel_gpu_add_param( node, "kernel_z_new", &kernel_z_new );
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
in_shape = attr[1]->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = in_shape->data[0];
gpu_param.global_size[1] = in_shape->data[1];
gpu_param.global_size[2] = in_shape->data[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
return status;
} /* _col2im_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _col2im_kernel_map;
size_t kernel_map_size = _cnt_of_array( _col2im_kernel_map );
vx_param_description_t * param_def = _col2im_kernel_param_def;
vx_kernel_initialize_f initializer = _col2im_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
else if (U8 == in_dtype)
{
in_dtype = U32;
}
else if (I8 == in_dtype || I16 == in_dtype)
{
in_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (U8 == out_dtype)
{
out_dtype = U32;
}
else if (I8 == out_dtype || I16 == out_dtype)
{
out_dtype = I32;
}
key = COL2IM_HASH_KEY( in_dtype, out_dtype ,image_2d);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _col2im_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_COL2IM_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float inputZp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float outputZp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inOutScale = inputScale / outputScale;
float inOutTile = outputZp - inOutScale * inputZp;
int32_t stride_w = vsi_nn_kernel_param_get_int32( params, "stride_w" );
int32_t stride_h = vsi_nn_kernel_param_get_int32( params, "stride_h" );
int32_t stride_d = vsi_nn_kernel_param_get_int32( params, "stride_d" );
int32_t dilation_w = vsi_nn_kernel_param_get_int32( params, "dilation_w" );
int32_t dilation_h = vsi_nn_kernel_param_get_int32( params, "dilation_h" );
int32_t dilation_d = vsi_nn_kernel_param_get_int32( params, "dilation_d" );
int32_t pad_w_front = vsi_nn_kernel_param_get_int32( params, "pad_w_front" );
int32_t pad_w_end = vsi_nn_kernel_param_get_int32( params, "pad_w_end" );
int32_t pad_h_front = vsi_nn_kernel_param_get_int32( params, "pad_h_front" );
int32_t pad_h_end = vsi_nn_kernel_param_get_int32( params, "pad_h_end" );
int32_t pad_d_front = vsi_nn_kernel_param_get_int32( params, "pad_d_front" );
int32_t pad_d_end = vsi_nn_kernel_param_get_int32( params, "pad_d_end" );
size_t dim_num = 0;
int32_t* block_shape = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "block_shape", &dim_num);
int32_t kernel_w = block_shape[0];
int32_t kernel_h = dim_num > 1 ? block_shape[1] : 1;
int32_t kernel_d = dim_num > 2 ? block_shape[2] : 1;
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
image_2d = dim_num > 2 ? FALSE : TRUE;
shapes[0][0] = inputs[0]->attr.size[0];
shapes[0][1] = inputs[0]->attr.size[1] / outputs[0]->attr.size[dim_num];
shapes[0][2] = inputs[0]->attr.size[2] * outputs[0]->attr.size[dim_num];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1];
if (image_2d)
{
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
}
else
{
shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3] * outputs[0]->attr.size[4];
}
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );
if (rs_input == NULL || rs_output == NULL)
{
goto final;
}
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
node_params[0] = rs_input;
node_params[1] = rs_output;
node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &stride_w );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &stride_h );
node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride_d );
node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_w );
node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_h );
node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_d );
node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_front );
node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_end );
node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_front );
node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_end );
node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_front );
node_params[13] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_end );
node_params[14] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_w );
node_params[15] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_h );
node_params[16] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_d );
node_params[17] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale );
node_params[18] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile );
status = vsi_nn_kernel_node_pass_param( node, node_params, _COL2IM_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
vsi_nn_kernel_scalar_release( &node_params[16] );
vsi_nn_kernel_scalar_release( &node_params[17] );
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( col2im, _setup )
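
As a quick worked check of the geometry the initializer above derives (presumably the dilated kernel extent and the number of sliding block positions), with illustrative values:

#include <stdio.h>

static void col2im_geometry_example( void )
{
    int width = 8, pad_w_front = 1, pad_w_end = 1;
    int kernel_w = 3, dilation_w = 2, stride_w = 2;

    int width_pad    = width + pad_w_front + pad_w_end;                   /* 10 */
    int kernel_x_new = (kernel_w - 1) * dilation_w + 1;                   /* 5: dilated kernel extent */
    int move_time_x  = (width_pad - kernel_x_new + stride_w) / stride_w;  /* 3: sliding positions along x */

    printf( "%d %d %d\n", width_pad, kernel_x_new, move_time_x );
}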

View File

@ -46,21 +46,36 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"
#define KERNEL_SOURCE_3 "cumsum_array_axis0"
#define KERNEL_SOURCE_4 "cumsum_array_axis1"
#define KERNEL_SOURCE_5 "cumsum_array_axis2"
#define KERNEL_SOURCE_6 "cumsum_array_2d_axis0"
#define KERNEL_SOURCE_7 "cumsum_array_2d_axis1"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d, is_array) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array))
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
KERNEL_SOURCE_1 },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
KERNEL_SOURCE_2 },
#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \
CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 1), \
CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -82,6 +97,22 @@ static const struct {
HASH_CUMSUM_KERNELS_2D(1, U8, U8)
HASH_CUMSUM_KERNELS_2D(1, F32, F32)
HASH_CUMSUM_KERNELS_2D(1, F32, U8)
HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(0, F32, U8, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(1, F32, F32, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(1, F32, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_5)
HASH_CUMSUM_ARRAY_KERNELS(2, F32, F32, KERNEL_SOURCE_5)
HASH_CUMSUM_ARRAY_KERNELS(2, F32, U8, KERNEL_SOURCE_5)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_6)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, F32, KERNEL_SOURCE_6)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, U8, KERNEL_SOURCE_6)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_7)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, F32, KERNEL_SOURCE_7)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, U8, KERNEL_SOURCE_7)
};
/*
@ -197,7 +228,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
int32_t is_2d
int32_t is_2d,
int32_t is_array
/* Add extra params */
)
{
@ -230,7 +262,7 @@ static vsi_status _query_kernel
output_dtype = F32;
}
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d, is_array);
for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
@ -270,6 +302,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
@ -291,6 +324,7 @@ static vsi_nn_kernel_node_t _setup
int32_t height = 0;
int32_t channel = 1;
uint32_t i = 0;
int32_t is_array = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
@ -326,13 +360,16 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
for (i = 0; i < rs_dim; i++)
{
return NULL;
if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE)
{
is_array = 1;
}
}
#undef VSI_NN_MAX_BLOCK_SIZE
status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d );
status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d, is_array);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
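
For context on the "array" cumsum variants added above: instead of rejecting shapes via the old vsi_nn_kernel_gpu_check_shape early-return, the setup now flags is_array whenever a reshaped dimension exceeds GPU_TENSOR_MAX_WIDTH and selects the cumsum_array_* kernels. A hedged sketch of that decision, assuming kernel/vsi_nn_kernel.h makes vsi_size_t and GPU_TENSOR_MAX_WIDTH visible as the kernel sources above do:

#include <stdint.h>
#include "kernel/vsi_nn_kernel.h"

static int cumsum_needs_array_kernel( const vsi_size_t * shape, uint32_t rank )
{
    uint32_t i;
    for ( i = 0; i < rank; i++ )
    {
        if ( shape[i] > GPU_TENSOR_MAX_WIDTH )
        {
            return 1;  /* too large for image-style access; fall back to the *_array_* kernels */
        }
    }
    return 0;
}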

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_TENSOR_GATHER_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
__BEGIN_DECLS
/*

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
__BEGIN_DECLS

View File

@ -36,6 +36,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
__BEGIN_DECLS
/*
@ -412,3 +414,4 @@ __END_DECLS
REGISTER_BACKEND_CL( nearest_grid_sample, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_TENSOR_POW_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#if (!VX_RESIZE_BILINEAR_SH_SUPPORT)
__BEGIN_DECLS
#define _RESIZE_BILINEAR_KERNEL_SOURCE() "resize_bilinear"
@ -319,3 +319,4 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( resize_bilinear, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_TILE_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_TENSOR_TILE_API_SUPPORT)
__BEGIN_DECLS

View File

@ -34,20 +34,24 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk2" }
#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -79,6 +83,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, F32, 4 ),
PACK_KERNEL_MAP( F32, F32, 5 ),
PACK_KERNEL_MAP( F32, F32, 6 ),
PACK_KERNEL_MAP( F32, F32, 9 ),
PACK_KERNEL_MAP( U32, U32, 0 ),
PACK_KERNEL_MAP( U32, U32, 1 ),
@ -87,6 +92,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( U32, U32, 4 ),
PACK_KERNEL_MAP( U32, U32, 5 ),
PACK_KERNEL_MAP( U32, U32, 6 ),
PACK_KERNEL_MAP( U32, U32, 9 ),
PACK_KERNEL_MAP( I32, I32, 0 ),
PACK_KERNEL_MAP( I32, I32, 1 ),
@ -95,6 +101,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( I32, I32, 4 ),
PACK_KERNEL_MAP( I32, I32, 5 ),
PACK_KERNEL_MAP( I32, I32, 6 ),
PACK_KERNEL_MAP( I32, I32, 9 ),
PACK_KERNEL_MAP( F32, U32, 0 ),
PACK_KERNEL_MAP( F32, U32, 1 ),
@ -103,6 +110,7 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, U32, 4 ),
PACK_KERNEL_MAP( F32, U32, 5 ),
PACK_KERNEL_MAP( F32, U32, 6 ),
PACK_KERNEL_MAP( F32, U32, 9 ),
PACK_KERNEL_MAP( F32, I32, 0 ),
PACK_KERNEL_MAP( F32, I32, 1 ),
@ -111,6 +119,10 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, I32, 4 ),
PACK_KERNEL_MAP( F32, I32, 5 ),
PACK_KERNEL_MAP( F32, I32, 6 ),
PACK_KERNEL_MAP( F32, I32, 9 ),
PACK_MERGE_KERNEL_MAP(U32, U32),
PACK_MERGE_KERNEL_MAP(I32, I32),
};
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@ -254,7 +266,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t num_stages
int32_t num_stages,
vsi_bool is_bitnoic_segment
)
{
vsi_status status = VSI_FAILURE;
@ -272,21 +285,23 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
num_stages = is_bitnoic_segment ? 0 : num_stages;
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_HASH_KEY( F32, F32, num_stages );
key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_HASH_KEY( U32, U32, num_stages );
key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_HASH_KEY( I32, I32, num_stages );
key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(F32, U32):
case _PACK_SELECT_KEY(F16, U32):
@ -294,7 +309,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, U16):
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = TOPK_HASH_KEY( F32, U32, num_stages );
key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
break;
case _PACK_SELECT_KEY(F32, I32):
case _PACK_SELECT_KEY(F16, I32):
@ -302,7 +317,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I8):
key = TOPK_HASH_KEY( F32, I32, num_stages );
key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
break;
default:
break;
@ -440,7 +455,12 @@ static vsi_nn_kernel_node_t _setup
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
@ -471,9 +491,22 @@ static vsi_nn_kernel_node_t _setup
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
if (num_stages < 7)
is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
if (is_bitnoic_segment && num_stages == 9)
{
status = _query_kernel( kernel, inputs, outputs, num_stages );
is_bitnoic_segment = FALSE;
}
else
{
num_stages = is_bitnoic_segment ? 9 : num_stages;
max_stages = is_bitnoic_segment ? max_stages : 7;
}
if (num_stages < max_stages || is_bitnoic_segment)
{
status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
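
For context on the new topk path: the stage count is derived from the sorting block size, and what the code calls the "bitnoic segment" branch (presumably a bitonic segment merge, matching the added topk2/PACK_MERGE kernels) is only considered for same-typed integer data with top_k <= 512 when the device's stage budget max_stages = 7 + log2(subGroupSize / 4) exceeds 9; when num_stages is exactly 9 the code drops back to the plain staged kernel. A hedged mirror of the stage arithmetic:

#include <math.h>

/* Illustrative mirror of the stage-count math in _setup; subGroupSize comes from the HW config. */
static void topk_stage_example( int block_size, int sub_group_size )
{
    int num_stages = (int)fmax( ceil( log2( block_size / 2.0 ) ), 0.0 );
    int max_stages = 7 + (int)log2( (double)( sub_group_size >> 2 ) );
    /* e.g. block_size = 4096, sub_group_size = 32  ->  num_stages = 11, max_stages = 10 */
    (void)num_stages;
    (void)max_stages;
}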

View File

@ -35,6 +35,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_ARGMAX_VX_SUPPORT)
__BEGIN_DECLS
#define HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
@ -510,3 +512,4 @@ __END_DECLS
REGISTER_BACKEND_EVIS( argmax, _setup )
#endif

View File

@ -51,26 +51,49 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_5 "cumsum_ex_rev_axis0"
#define KERNEL_SOURCE_6 "cumsum_ex_rev_axis1"
#define KERNEL_SOURCE_7 "cumsum_ex_rev_axis2"
#define KERNEL_SOURCE_8 "cumsum_array"
#define KERNEL_SOURCE_9 "cumsum_array_2d"
#define KERNEL_SOURCE_10 "cumsum_array_bf16"
#define KERNEL_SOURCE_11 "cumsum_array_f16_u8"
#define KERNEL_SOURCE_12 "cumsum_array_ex_rev_axis0"
#define KERNEL_SOURCE_13 "cumsum_array_ex_rev_axis1"
#define KERNEL_SOURCE_14 "cumsum_array_ex_rev_axis2"
#define KERNEL_SOURCE_15 "cumsum_array_f16_u8_2d"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \
((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d, is_array) \
((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array))
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
#define HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
#define HASH_CUMSUM_ARRAY_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -135,6 +158,65 @@ static const struct {
HASH_CUMSUM_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, I8, I8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, I16, I16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, F16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(0, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, I8, I8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, I16, I16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, F16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(1, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, I8, I8, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, I16, I16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, F16, KERNEL_SOURCE_8)
HASH_CUMSUM_ARRAY_KERNELS(2, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_9)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_10)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(0, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(1, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS(2, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I16, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_15)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, U8, U8, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I8, I8, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I16, I16, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, F16, F16, KERNEL_SOURCE_12)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, U8, U8, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I8, I8, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I16, I16, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, F16, KERNEL_SOURCE_13)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, U8, U8, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I8, I8, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I16, I16, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, F16, KERNEL_SOURCE_14)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I16, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_11)
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_11)
};
/*
@ -161,6 +243,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
size_t param_size
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
@ -188,6 +271,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
float in_out_zp_scale = 1.0f;
float in_out_scale = 1.0f;
int32_t is_array = 0;
int32_t remainder = 0;
uint32_t pack_key = 0;
VSI_UNREFERENCED(param_size);
@ -219,7 +305,15 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
height = (int32_t)(input_shape->data[1]);
channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);
if (width > VSI_NN_MAX_BLOCK_SIZE ||
height > VSI_NN_MAX_BLOCK_SIZE ||
channel > VSI_NN_MAX_BLOCK_SIZE)
{
is_array = 1;
}
#undef VSI_NN_MAX_BLOCK_SIZE
if (axis == 0)
{
w = 1;
@ -245,6 +339,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
{
shaderParam.global_scale[0] = 16;
}
remainder = w % shaderParam.global_scale[0];
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0];
@ -253,6 +348,12 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
if (is_array)
{
status = vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
status |= vsi_nn_kernel_gpu_add_param(node, "w_size", &w);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \
(IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24))
@ -767,7 +868,8 @@ static vsi_status _query_kernel
const vsi_nn_kernel_param_t * params,
int32_t axis,
int32_t is_2d,
int32_t is_ex_rev
int32_t is_ex_rev,
int32_t is_array
)
{
vsi_status status = VSI_FAILURE;
@ -781,7 +883,7 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d);
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d, is_array);
for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
@ -819,6 +921,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
@ -831,7 +934,10 @@ static vsi_nn_kernel_node_t _setup
int32_t is_2d = 0;
uint32_t rs_dim = 2;
uint32_t i = 0;
int32_t is_array = 0;
int32_t is_ex_or_rev = exclusive || reverse;
vsi_nn_kernel_dtype_e input0_dtype = U8;
int32_t width = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
@ -860,7 +966,30 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev);
width = (int32_t)shapes[0][0];
for (i = 0; i < rs_dim; i++)
{
if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE)
{
is_array = 1;
}
}
#undef VSI_NN_MAX_BLOCK_SIZE
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
if (is_array &&
((axis_new == 0 && width < 8) ||
(axis_new > 0 && (((input0_dtype == U8 || input0_dtype == I8) && width < 16) ||
((input0_dtype != U8 && input0_dtype != I8) && width < 8)))
))
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev, is_array);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_GATHER_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if !(VX_TENSOR_GATHER_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -58,14 +58,14 @@ __BEGIN_DECLS
_3D
} vsi_nn_kernel_coord_type_e;
#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim) \
((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim))
#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim, is_array) \
((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim << 4) | (is_array))
#define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("evis.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0), \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 0), \
HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
@ -73,10 +73,26 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1), \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 0), \
HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
#define HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("evis.gather_nd_array_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_ARRAY_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 1), \
HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
#define HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("evis.gather_nd_array_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 1), \
HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -125,6 +141,50 @@ static const struct {
TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_1)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _1D, KERNEL_SOURCE_4)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _2D, KERNEL_SOURCE_5)
TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _3D, KERNEL_SOURCE_6)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_7)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8)
TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8)
};
/*
@ -148,7 +208,8 @@ static vsi_status get_gather_nd_tensor_reshape_size
vsi_size_t block_size,
uint32_t coordDim,
int32_t* newDim,
uint32_t batch_dims
uint32_t batch_dims,
int32_t* arrayFlg
)
{
vsi_status status = VSI_FAILURE;
@ -184,12 +245,20 @@ static vsi_status get_gather_nd_tensor_reshape_size
for (i = 0; i < coordDim - 1; i++)
{
sizes[rank++] = input_size[i + offset];
if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
}
for (i = 0; i < batch_dims; i++)
{
sizes[rank] *= input_size[dims_num - i - 1];
}
if (sizes[rank] >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
newDim[0] = rank + 1;
}
@ -198,6 +267,10 @@ static vsi_status get_gather_nd_tensor_reshape_size
for (i = coordDim-1; i > 0; i--)
{
sizes[i] = input_size[i + offset - 1];
if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
}
for (i = 0; i < offset; i++)
{
@ -210,6 +283,10 @@ static vsi_status get_gather_nd_tensor_reshape_size
newDim[0] = 2;
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
}
}
else if (coordDim == 4)
{
@ -242,6 +319,14 @@ static vsi_status get_gather_nd_tensor_reshape_size
status = VSI_SUCCESS;
newDim[0] = 3;
}
else
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
status = VSI_SUCCESS;
newDim[0] = 2;
arrayFlg[0] = 1;
}
}
#undef VSI_NN_MAX_IMAGE_WIDTH
@ -409,7 +494,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
int32_t coord_dim,
int32_t batch_dims
int32_t batch_dims,
int32_t is_array
)
{
vsi_status status = VSI_FAILURE;
@ -444,7 +530,7 @@ static vsi_status _query_kernel
coord_type = _3D;
}
key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg );
key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg, is_array);
for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
{
@ -482,6 +568,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
@ -489,26 +576,41 @@ static vsi_nn_kernel_node_t _setup
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t input_size = 1;
int32_t no_block_batch_size = 1;
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
int32_t is_array = 0;
int32_t i = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims);
status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims);
status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims);
for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++)
{
input_size = input_size * (int32_t)inputs[0]->attr.size[i];
}
no_block_batch_size = input_size / block_size;
is_array = no_block_batch_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;
status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0],
block_size, coord_dim, &rs_in_dim, batch_dims, &is_array);
status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1],
coord_dim, 0, &rs_idx_dim, batch_dims, &is_array);
status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2],
block_size, 0, &rs_out_dim, batch_dims, &is_array);
#undef VSI_NN_MAX_BLOCK_SIZE
if (status != VSI_SUCCESS)
{
return NULL;
}
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
//if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
// outputs[0]->attr.dim_num ) )
//{
// return NULL;
//}
status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims );
status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims, is_array);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
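For reference, a minimal host-side sketch (illustrative only, not part of this diff) of the width check that drives the new _array_ kernel selection above. MAX_IMAGE_WIDTH stands in for GPU_TENSOR_MAX_WIDTH and is assumed to be 65536 here; use_array_kernel is a made-up helper name.

/* Illustrative sketch: mirrors how the evis gather_nd setup decides between
 * the image-based and the "_array_" shader variants. */
#include <stdint.h>
#include <stdio.h>

#define MAX_IMAGE_WIDTH 65536  /* assumed value of GPU_TENSOR_MAX_WIDTH */

static int use_array_kernel(const int64_t *dims, int rank, int64_t block_size)
{
    int64_t element_cnt = 1;
    int i;
    for (i = 0; i < rank; i++)
    {
        element_cnt *= dims[i];
    }
    /* Once the flattened (element_cnt / block_size) axis no longer fits an
     * image width, the reshape helper raises arrayFlg and _query_kernel
     * picks the gather_nd_array_* kernels instead. */
    return (element_cnt / block_size) >= MAX_IMAGE_WIDTH ? 1 : 0;
}

int main(void)
{
    int64_t dims[3] = { 128, 1024, 1024 };
    printf("is_array = %d\n", use_array_kernel(dims, 3, 128));
    return 0;
}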

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
__BEGIN_DECLS
#define SOURCE_AXIS0_0 "layer_normalization_0"

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_LOGSOFTMAX_VX_SUPPORT)
__BEGIN_DECLS
#define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \

View File

@ -36,6 +36,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
__BEGIN_DECLS
/*
@ -625,3 +627,4 @@ __END_DECLS
REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_POW_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#if !(VX_TENSOR_POW_API_SUPPORT)
__BEGIN_DECLS
#define KERNEL_SOURCE "pow",

View File

@ -750,6 +750,7 @@ static vsi_nn_kernel_node_t _setup
shape[2] = 1;
reshape_tensor = vsi_nn_reshape_tensor( graph,
outputs[0], shape, outputs[0]->attr.dim_num );
CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final);
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
outputs[0]->attr.dim_num ) )
@ -819,6 +820,7 @@ static vsi_nn_kernel_node_t _setup
final:
vsi_nn_safe_free(node_params);
vsi_safe_release_tensor(reshape_tensor);
return node;
} /* _setup() */

View File

@ -911,6 +911,7 @@ static vsi_nn_kernel_node_t _setup
shape[2] = 1;
reshape_tensor = vsi_nn_reshape_tensor( graph,
outputs[0], shape, outputs[0]->attr.dim_num );
CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final);
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
outputs[0]->attr.dim_num ) )
@ -978,6 +979,7 @@ static vsi_nn_kernel_node_t _setup
final:
vsi_nn_safe_free(node_params);
vsi_safe_release_tensor(reshape_tensor);
return node;
} /* _setup() */

View File

@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util_prv.h"
#if (!VX_RESIZE_BILINEAR_SH_SUPPORT)
__BEGIN_DECLS
/*
@ -1515,3 +1515,4 @@ final:
__END_DECLS
REGISTER_BACKEND_EVIS( resize_bilinear, _setup )
#endif

View File

@ -22,7 +22,7 @@
*
*****************************************************************************/
#if !(VX_TENSOR_TILE_API_SUPPORT)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -36,7 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#if !(VX_TENSOR_TILE_API_SUPPORT)
__BEGIN_DECLS
/*

View File

@ -29,6 +29,7 @@
#include "vsi_nn_context.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
@ -1673,7 +1674,7 @@ vsi_status vsi_nn_KernelGpuConfig
static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
{
int32_t enableShader = graph->ctx->options.enable_shader;
int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
if ( graph->ctx->config.subGroupSize == 0 )

View File

@ -181,6 +181,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(cos)
#if (VX_LOGSOFTMAX_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#endif
#if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif
__END_DECLS

View File

@ -916,11 +916,21 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node
{
input = in_tensor;
output = tensor;
/* Create an OpenVX tensor if it does not exist */
if (NULL == input->t)
{
vsi_nn_TensorReinit(graph, input);
}
}
else
{
input = tensor;
output = in_tensor;
/* Create an OpenVX tensor if it does not exist */
if (NULL == output->t)
{
vsi_nn_TensorReinit(graph, output);
}
}
vxTensorReshapeNode(graph->g, input->t, &reshape_param, sizeof(reshape_param), output->t);

View File

@ -0,0 +1,79 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_ARGMAX_VX_SUPPORT)
#define REGISTER_ARGMAXOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_ARGMAXOPENVX_KERNEL( argmax )
{
vx_node node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxArgmaxLayer(graph->g,
inputs[0]->t,
axis,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* argmax() */
#undef REGISTER_ARGMAXOPENVX_KERNEL
#endif

View File

@ -0,0 +1,77 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_BITCAST_VX_SUPPORT)
#define REGISTER_BITCASTOPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_BITCASTOPENVX_KERNEL( bitcast )
{
vx_node node = NULL;
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxBitCastLayer(graph->g,
inputs[0]->t,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* bitcast() */
#undef REGISTER_BITCASTOPENVX_KERNEL
#endif

View File

@ -0,0 +1,91 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vx_node node = NULL;
int32_t mode =
vsi_nn_kernel_param_get_int32(params, "mode");
int32_t align_corners =
vsi_nn_kernel_param_get_int32(params, "align_corners");
int32_t pad_mode =
vsi_nn_kernel_param_get_int32(params, "padding_mode");
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(input_num);
node = vxGridSampleLayer(
graph->g,
inputs[0]->t,
inputs[1]->t,
mode,
align_corners,
pad_mode,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* _setup() */
#define REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL(KERNEL_NAME) \
static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num, \
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
) \
{ \
return _setup(graph, inputs, input_num, outputs, output_num, \
params, kernel); \
} \
REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL( nearest_grid_sample )
#undef REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL
#endif

View File

@ -0,0 +1,82 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_L1_LAYER_NORM_VX_SUPPORT)
#define REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( l1_layer_norm )
{
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
node = vxL1LayerNormalizationLayer(
graph->g,
eps,
axis,
inputs[0]->t,
inputs[1]->t,
inputs[2]->t,
inputs[3]->t,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* l1_layer_norm() */
#undef REGISTER_L1_LAYER_NORM_OPENVX_KERNEL
#endif

View File

@ -0,0 +1,162 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
_viv_uniform int width_pad;
_viv_uniform int height_pad;
_viv_uniform int depth_pad;
_viv_uniform int move_time_x;
_viv_uniform int move_time_y;
_viv_uniform int kernel_x_new;
_viv_uniform int kernel_y_new;
_viv_uniform int kernel_z_new;
_viv_uniform int depth;
#define COL2IM(name, read_type, dst_type, convert_type, write_type) \
__kernel void col2im_##name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_w, \
int stride_h, \
int stride_d, \
int dilation_w, \
int dilation_h, \
int dilation_d, \
int pad_w_front, \
int pad_w_end, \
int pad_h_front, \
int pad_h_end, \
int pad_d_front, \
int pad_d_end, \
int kernel_x, \
int kernel_y, \
int kernel_z, \
float inOutScale, \
float inOutTile \
) \
{ \
int x = get_global_id(0); \
int y = get_global_id(1); \
int z = get_global_id(2); \
int4 coord_out = (int4)(x,y,z,0); \
int b = z / depth; \
z = z % depth; \
int4 coord_in = (int4)(0,0,b,0); \
\
float sum = 0.0f; \
x = x + pad_w_front; \
y = y + pad_h_front; \
z = z + pad_d_front; \
int offset_x = x % stride_w; \
int offset_y = y % stride_h; \
int offset_z = z % stride_d; \
int i,j,k; \
for (k = offset_z; k < kernel_z_new; k += stride_d) \
{ \
if ((z - k) < 0 || (z + (kernel_z_new - k)) > depth_pad || k % dilation_d != 0) \
{ \
continue; \
} \
for (j = offset_y; j < kernel_y_new; j = j + stride_h) \
{ \
if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \
{ \
continue; \
} \
for (i = offset_x; i < kernel_x_new; i = i + stride_w) \
{ \
if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \
{ \
continue; \
} \
coord_in.x = (x - i + stride_w - 1) / stride_w + \
(y - j + stride_h - 1) / stride_h * move_time_x + \
(z - k + stride_d - 1) / stride_d * move_time_y * move_time_x; \
coord_in.y = i / dilation_w + j * kernel_x / dilation_h + k * kernel_x * kernel_y / dilation_d; \
sum = sum + convert_float(read_type(input, coord_in).x); \
} \
} \
} \
sum = sum * inOutScale + inOutTile; \
dst_type dst = 0; \
dst.x = convert_type(sum); \
write_type(output, coord_out, dst); \
}
COL2IM(U32toU32, read_imageui, uint4, convert_uint, write_imageui)
COL2IM(U32toI32, read_imageui, int4, convert_int, write_imagei)
COL2IM(U32toF32, read_imageui, float4, convert_float, write_imagef)
COL2IM(I32toU32, read_imagei, uint4, convert_uint, write_imageui)
COL2IM(I32toI32, read_imagei, int4, convert_int, write_imagei)
COL2IM(I32toF32, read_imagei, float4, convert_float, write_imagef)
COL2IM(F32toU32, read_imagef, uint4, convert_uint, write_imageui)
COL2IM(F32toI32, read_imagef, int4, convert_int, write_imagei)
COL2IM(F32toF32, read_imagef, float4, convert_float, write_imagef)
#define COL2IM_2D(name, read_type, dst_type, convert_type, write_type) \
__kernel void col2im_##name##_2D \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride_w, \
int stride_h, \
int stride_d, \
int dilation_w, \
int dilation_h, \
int dilation_d, \
int pad_w_front, \
int pad_w_end, \
int pad_h_front, \
int pad_h_end, \
int pad_d_front, \
int pad_d_end, \
int kernel_x, \
int kernel_y, \
int kernel_z, \
float inOutScale, \
float inOutTile \
) \
{ \
int x = get_global_id(0); \
int y = get_global_id(1); \
int z = get_global_id(2); \
int4 coord_out = (int4)(x,y,z,0); \
int4 coord_in = (int4)(0,0,z,0); \
\
float sum = 0.0f; \
x = x + pad_w_front; \
y = y + pad_h_front; \
int offset_x = x % stride_w; \
int offset_y = y % stride_h; \
int i,j; \
for (j = offset_y; j < kernel_y_new; j = j + stride_h) \
{ \
if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \
{ \
continue; \
} \
for (i = offset_x; i < kernel_x_new; i = i + stride_w) \
{ \
if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \
{ \
continue; \
} \
coord_in.x = (x - i + stride_w - 1) / stride_w + \
(y - j + stride_h - 1) / stride_h * move_time_x; \
coord_in.y = i / dilation_w + j * kernel_x / dilation_h; \
sum = sum + convert_float(read_type(input, coord_in).x); \
} \
} \
sum = sum * inOutScale + inOutTile; \
dst_type dst = 0; \
dst.x = convert_type(sum); \
write_type(output, coord_out, dst); \
}
COL2IM_2D(U32toU32, read_imageui, uint4, convert_uint, write_imageui)
COL2IM_2D(U32toI32, read_imageui, int4, convert_int, write_imagei)
COL2IM_2D(U32toF32, read_imageui, float4, convert_float, write_imagef)
COL2IM_2D(I32toU32, read_imagei, uint4, convert_uint, write_imageui)
COL2IM_2D(I32toI32, read_imagei, int4, convert_int, write_imagei)
COL2IM_2D(I32toF32, read_imagei, float4, convert_float, write_imagef)
COL2IM_2D(F32toU32, read_imagef, uint4, convert_uint, write_imageui)
COL2IM_2D(F32toI32, read_imagef, int4, convert_int, write_imagei)
COL2IM_2D(F32toF32, read_imagef, float4, convert_float, write_imagef)
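As a reading aid, a small standalone sketch (illustrative only, not part of this diff) of the input location that one contributing kernel offset maps to in the col2im kernels above. The move_time values count sliding-window positions per axis, and every number below is a made-up example parameter.

/* Illustrative sketch: the column/row read for one (i, j, k) kernel offset. */
#include <stdio.h>

int main(void)
{
    int stride_w = 2, stride_h = 2, stride_d = 1;
    int dilation_w = 1, dilation_h = 1, dilation_d = 1;
    int kernel_x = 3, kernel_y = 3;
    int move_time_x = 4, move_time_y = 4;      /* windows along width / height */
    int x = 5, y = 5, z = 0;                   /* padded output coordinate     */
    int i = 1, j = 1, k = 0;                   /* kernel offset being summed   */

    /* Same index arithmetic as coord_in.x / coord_in.y in the kernels above. */
    int col = (x - i + stride_w - 1) / stride_w
            + (y - j + stride_h - 1) / stride_h * move_time_x
            + (z - k + stride_d - 1) / stride_d * move_time_y * move_time_x;
    int row = i / dilation_w
            + j * kernel_x / dilation_h
            + k * kernel_x * kernel_y / dilation_d;

    printf("input column = %d, input row = %d\n", col, row);
    return 0;
}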

View File

@ -0,0 +1,332 @@
__kernel void cumsum_array_F32toF32_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = (float)(0);
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(; coord.x > 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.z--;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord.z = 0;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.z++;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
__kernel void cumsum_array_U8toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint sum = (uint)(0);
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global uint* in_ptr = (__global uint*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(; coord.x > 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
coord.z = 0;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}
__kernel void cumsum_array_F32toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = 0.0f;
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(; coord.x > 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
coord.z = 0;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}
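For orientation, a scalar sketch (illustrative only, not part of this diff) of the requantization step the U8 cumsum kernels above apply after accumulating cnt raw input values. The scale and zero-point numbers are made-up examples, and lrintf stands in for convert_int_rte.

/* Illustrative sketch: tmpSum = sum * in_out_scale + cnt * in_out_zp_scale + output_zp */
#include <math.h>
#include <stdio.h>

int main(void)
{
    float in_out_scale = 0.5f;      /* assumed input_scale / output_scale ratio */
    float in_out_zp_scale = -32.0f; /* assumed per-element zero-point term      */
    float output_zp = 128.0f;
    unsigned int sum = 300u;        /* running sum of raw U8 inputs  */
    float cnt = 3.0f;               /* number of accumulated elements */

    float tmpAlpha = cnt * in_out_zp_scale + output_zp;
    float tmpSum = (float)sum * in_out_scale + tmpAlpha;
    unsigned int dst = (unsigned int)lrintf(tmpSum); /* round-to-nearest analogue */

    printf("quantized cumsum output = %u\n", dst);   /* 182 for these numbers */
    return 0;
}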

View File

@ -0,0 +1,321 @@
__kernel void cumsum_array_F32toF32_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = (float)(0);
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord.w = height - 1;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.w--;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord.w++;
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
__kernel void cumsum_array_U8toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint sum = (uint)(0);
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global uint* in_ptr = (__global uint*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.w = height - 1;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global uint*)input_ptr;
uint data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}
__kernel void cumsum_array_F32toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float sum = (float)(0);
uint dst = (uint)(0);
int tmp_zp = convert_int_rte(output_zp);
dst = convert_uint_sat(tmp_zp);
float cnt = 0;
Image img1 = create_image_from_image2d(input, 4);
Image img2 = create_image_from_image2d(output, 4);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global uint* out_ptr = (__global uint*)output_ptr;
if(exclusive && rev)
{
coord.w = height - 1;
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(exclusive)
{
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.zw);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_image_ptr_from_coord(img1, coord.xy);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum * in_out_scale + tmpAlpha;
dst = (uint)convert_int_rte(tmpSum);
output_ptr = get_image_ptr_from_coord(img2, coord.xy);
out_ptr = (__global uint*)output_ptr;
out_ptr[0] = dst;
}
}
}

View File

@ -0,0 +1,215 @@
__kernel void cumsum_array_F32toF32_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float sum = (float)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 4);
Tensor img2 = create_tensor_from_image2d_array(output, 4);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord_out.x = width - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.x = width - 1; coord.x > 0; coord.x--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.x--;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord_out.x = 0;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.x++;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
#define CUMSUM_ARRAY_toU8_AXIS0_SH(name, src_type) \
__kernel void cumsum_array_##name##toU8_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int channel, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint dst = (uint)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst = convert_uint_sat(tmp_zp); \
\
float cnt = 0; \
\
Tensor img1 = create_tensor_from_image2d_array(input, 4); \
Tensor img2 = create_tensor_from_image2d_array(output, 4); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global uint* out_ptr = (__global uint*)output_ptr; \
if(exclusive && rev) \
{ \
coord_out.x = width - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.x = width - 1; coord.x > 0; coord.x--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.x--; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(exclusive) \
{ \
coord_out.x = 0; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.x = 0; coord.x < width - 1; coord.x++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.x++; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(rev) \
{ \
for(coord.x = width - 1; coord.x >= 0; coord.x--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else \
{ \
for(coord.x = 0; coord.x < width; coord.x++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_toU8_AXIS0_SH(U8,uint)
CUMSUM_ARRAY_toU8_AXIS0_SH(F32,float)

View File

@ -0,0 +1,216 @@
__kernel void cumsum_array_F32toF32_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float sum = (float)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 4);
Tensor img2 = create_tensor_from_image2d_array(output, 4);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord_out.y = height - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.y--;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord_out.y = 0;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.y++;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
#define CUMSUM_ARRAY_toU8_AXIS1_SH(name, src_type) \
__kernel void cumsum_array_##name##toU8_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int channel, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint dst = (uint)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst = convert_uint_sat(tmp_zp); \
\
float cnt = 0; \
\
Tensor img1 = create_tensor_from_image2d_array(input, 4); \
Tensor img2 = create_tensor_from_image2d_array(output, 4); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global uint* out_ptr = (__global uint*)output_ptr; \
if(exclusive && rev) \
{ \
coord_out.y = height - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
\
for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
coord_out.y--; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(exclusive) \
{ \
coord_out.y = 0; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
coord_out.y++; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else \
{ \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_toU8_AXIS1_SH(U8,uint)
CUMSUM_ARRAY_toU8_AXIS1_SH(F32,float)

View File

@ -0,0 +1,215 @@
__kernel void cumsum_array_F32toF32_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float sum = 0;
Tensor img1 = create_tensor_from_image2d_array(input, 4);
Tensor img2 = create_tensor_from_image2d_array(output, 4);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global float* in_ptr = (__global float*)input_ptr;
__global float* out_ptr = (__global float*)output_ptr;
if(exclusive && rev)
{
coord_out.z = channel - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.z = channel - 1; coord.z > 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.z--;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(exclusive)
{
coord_out.z = 0;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
for(coord.z = 0; coord.z < channel - 1; coord.z++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
coord_out.z++;
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else if(rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
else
{
for(coord.z = 0; coord.z < channel; coord.z++)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global float*)input_ptr;
float data = in_ptr[0];
sum += data;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global float*)output_ptr;
out_ptr[0] = sum;
}
}
}
#define CUMSUM_ARRAY_toU8_AXIS2_SH(name, src_type) \
__kernel void cumsum_array_##name##toU8_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int channel, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint dst = (uint)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst = convert_uint_sat(tmp_zp); \
\
float cnt = 0.0f; \
Tensor img1 = create_tensor_from_image2d_array(input, 4); \
Tensor img2 = create_tensor_from_image2d_array(output, 4); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global uint* out_ptr = (__global uint*)output_ptr; \
\
if(exclusive && rev) \
{ \
coord_out.z = channel - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.z = channel - 1; coord.z > 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.z--; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(exclusive) \
{ \
coord_out.z = 0; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
for(coord.z = 0; coord.z < channel - 1; coord.z++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
coord_out.z++; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else if(rev) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
else \
{ \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src_type data = in_ptr[0]; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum * in_out_scale + tmpAlpha; \
\
dst = (uint)convert_int_rte(tmpSum); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global uint*)output_ptr; \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_toU8_AXIS2_SH(U8,uint)
CUMSUM_ARRAY_toU8_AXIS2_SH(F32,float)

View File

@ -18,8 +18,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
__local float local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -78,13 +78,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32(1 << 0, 0)
TOPK_F32(1 << 1, 1)
TOPK_F32(1 << 2, 2)
TOPK_F32(1 << 3, 3)
TOPK_F32(1 << 4, 4)
TOPK_F32(1 << 5, 5)
TOPK_F32(1 << 6, 6)
TOPK_F32((1 << 0), 0)
TOPK_F32((1 << 1), 1)
TOPK_F32((1 << 2), 2)
TOPK_F32((1 << 3), 3)
TOPK_F32((1 << 4), 4)
TOPK_F32((1 << 5), 5)
TOPK_F32((1 << 6), 6)
TOPK_F32((1 << 9), 9)
#define TOPK_U32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \
@ -106,8 +107,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local uint local_data[128]; \
__local uint local_indices[128]; \
__local uint local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
uint left = read_imageui(input, coord.xy).x; \
coord.z += work_group_size; \
@ -139,7 +140,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
uint left_elem = local_data[left_id]; \
uint right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -166,13 +167,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_U32(1 << 0, 0)
TOPK_U32(1 << 1, 1)
TOPK_U32(1 << 2, 2)
TOPK_U32(1 << 3, 3)
TOPK_U32(1 << 4, 4)
TOPK_U32(1 << 5, 5)
TOPK_U32(1 << 6, 6)
TOPK_U32((1 << 0), 0)
TOPK_U32((1 << 1), 1)
TOPK_U32((1 << 2), 2)
TOPK_U32((1 << 3), 3)
TOPK_U32((1 << 4), 4)
TOPK_U32((1 << 5), 5)
TOPK_U32((1 << 6), 6)
TOPK_U32((1 << 9), 9)
#define TOPK_I32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \
@ -194,8 +196,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local int local_data[128]; \
__local int local_indices[128]; \
__local int local_data[LOCAL_SIZE0 * 2]; \
__local int local_indices[LOCAL_SIZE0 * 2]; \
\
int left = read_imagei(input, coord.xy).x; \
coord.z += work_group_size; \
@ -227,7 +229,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
int left_elem = local_data[left_id]; \
int right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -254,13 +256,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_I32(1 << 0, 0)
TOPK_I32(1 << 1, 1)
TOPK_I32(1 << 2, 2)
TOPK_I32(1 << 3, 3)
TOPK_I32(1 << 4, 4)
TOPK_I32(1 << 5, 5)
TOPK_I32(1 << 6, 6)
TOPK_I32((1 << 0), 0)
TOPK_I32((1 << 1), 1)
TOPK_I32((1 << 2), 2)
TOPK_I32((1 << 3), 3)
TOPK_I32((1 << 4), 4)
TOPK_I32((1 << 5), 5)
TOPK_I32((1 << 6), 6)
TOPK_I32((1 << 9), 9)
#define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \
@ -282,8 +285,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
__local float local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
@ -315,7 +318,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -342,13 +345,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32toU32(1 << 0, 0)
TOPK_F32toU32(1 << 1, 1)
TOPK_F32toU32(1 << 2, 2)
TOPK_F32toU32(1 << 3, 3)
TOPK_F32toU32(1 << 4, 4)
TOPK_F32toU32(1 << 5, 5)
TOPK_F32toU32(1 << 6, 6)
TOPK_F32toU32((1 << 0), 0)
TOPK_F32toU32((1 << 1), 1)
TOPK_F32toU32((1 << 2), 2)
TOPK_F32toU32((1 << 3), 3)
TOPK_F32toU32((1 << 4), 4)
TOPK_F32toU32((1 << 5), 5)
TOPK_F32toU32((1 << 6), 6)
TOPK_F32toU32((1 << 9), 9)
#define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toI32_I32 \
@ -370,8 +374,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
__local float local_data[LOCAL_SIZE0 * 2]; \
__local uint local_indices[LOCAL_SIZE0 * 2]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
@ -403,7 +407,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -430,10 +434,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32toI32(1 << 0, 0)
TOPK_F32toI32(1 << 1, 1)
TOPK_F32toI32(1 << 2, 2)
TOPK_F32toI32(1 << 3, 3)
TOPK_F32toI32(1 << 4, 4)
TOPK_F32toI32(1 << 5, 5)
TOPK_F32toI32(1 << 6, 6)
TOPK_F32toI32((1 << 0), 0)
TOPK_F32toI32((1 << 1), 1)
TOPK_F32toI32((1 << 2), 2)
TOPK_F32toI32((1 << 3), 3)
TOPK_F32toI32((1 << 4), 4)
TOPK_F32toI32((1 << 5), 5)
TOPK_F32toI32((1 << 6), 6)
TOPK_F32toI32((1 << 9), 9)

View File

@ -0,0 +1,368 @@
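/*
 * Bitonic sort helpers for the single-work-group top-k kernels below.
 *
 * bitonic_step_*        : as used here, a full bitonic sort of the
 *                         2 * BLOCK_SIZE resident elements into descending
 *                         order; keys live in local_data, the matching source
 *                         positions in local_indices, and the index is used
 *                         as a secondary key so equal values keep a
 *                         deterministic order.
 * bitonic_step_ascend_* : the same network with the comparison flipped, used
 *                         to pre-sort every streamed-in chunk ascending.
 * bitonic_merge_*       : only the final merge stage, run after a chunk's
 *                         larger half has been copied into the resident
 *                         region.
 */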
#define BITONIC_STEP(dtype) \
void bitonic_step_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP(int)
BITONIC_STEP(uint)
#define BITONIC_STEP_ASCEND(dtype) \
void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
__local dtype *p_share_k, __local int *p_share_v) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = p_share_v[left_id]; \
int right_idx = p_share_v[right_id]; \
\
dtype left_elem = p_share_k[left_id]; \
dtype right_elem = p_share_k[right_id]; \
\
if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
{ \
p_share_k[left_id] = right_elem; \
p_share_k[right_id] = left_elem; \
\
p_share_v[left_id] = right_idx; \
p_share_v[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP_ASCEND(int)
BITONIC_STEP_ASCEND(uint)
#define BITONIC_MERGE(dtype) \
void bitonic_merge_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
uint stage = num_stages; \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
}
BITONIC_MERGE(int)
BITONIC_MERGE(uint)
#define BLOCK_SIZE (512)
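/*
 * Per-row top-BLOCK_SIZE selection for rows longer than 2 * BLOCK_SIZE.
 * One work-group of BLOCK_SIZE threads handles one row:
 *   1. load the first 2 * BLOCK_SIZE elements and bitonic-sort them
 *      descending; min_data = local_data[BLOCK_SIZE - 1] is the current
 *      cut-off, i.e. the BLOCK_SIZE-th largest value seen so far;
 *   2. stream the rest of the row in 2 * BLOCK_SIZE chunks, sort each chunk
 *      ascending in the scratch half of local memory, and skip it when its
 *      maximum is below the cut-off; otherwise merge its larger half against
 *      the resident values and refresh the cut-off;
 *   3. the tail chunk (width not a multiple of 2 * BLOCK_SIZE) is padded with
 *      init_k / init_v and handled the same way;
 *   4. finally the best BLOCK_SIZE values and their indices are written out.
 * The _num_stages argument appears to be kept only to match the common topk
 * kernel signature; the stage count here is fixed to 9 (log2(BLOCK_SIZE)).
 */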
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const int init_k = -2147483647;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local int local_data[1536];
__local int local_indices[1536];
int left = read_imagei(input, coord.xy).x;
coord.z += threads_per_block;
int right = read_imagei(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_int(num_stages, lx, local_data, local_indices);
int min_data = local_data[511];
int *p_share_k = local_data + threads_per_block;
int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
int2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imagei(input, coord.xy).x;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
int2 data;
data.x = read_imagei(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
}
}
int4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imagei(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const uint init_k = 0;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local uint local_data[1536];
__local int local_indices[1536];
uint left = read_imageui(input, coord.xy).x;
coord.z += threads_per_block;
uint right = read_imageui(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_uint(num_stages, lx, local_data, local_indices);
uint min_data = local_data[511];
uint *p_share_k = local_data + threads_per_block;
int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
uint2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imageui(input, coord.xy).x;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
uint2 data;
data.x = read_imageui(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
}
}
uint4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imageui(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}

View File

@ -0,0 +1,344 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int input_zp;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
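/*
 * Array (pointer-based) cumulative-sum kernels along axis 2, 1 and 0.
 *
 * For the quantized kernels the accumulation runs on raw integer values and
 * the rescaling is folded into host-provided constants.  A rough per-element
 * reference of what the axis-2 loops below compute (host-side sketch;
 * in_out_scale and in_out_zp_scale fold the input/output scales and the
 * input zero point, output_zp is the output zero point):
 *
 *     int   raw_sum = 0;
 *     float cnt     = 0.0f;
 *     for (int z = 0; z < channel; ++z) {
 *         raw_sum += in[z];                     // raw quantized input
 *         cnt     += 1.0f;
 *         float acc = raw_sum * in_out_scale    // rescale the running sum
 *                   + cnt * in_out_zp_scale     // compensate the accumulated input zero point
 *                   + output_zp;
 *         out[z] = saturate(round_to_nearest(acc));
 *     }
 *
 * The w_size / remainder check pulls the last vector access back inside the
 * row when the width is not a multiple of the vector size (8 or 16), so no
 * load or store runs past the end of a row.
 */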
__kernel void cumsum_array_F16toF16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.z = 0; coord.z < channel; coord.z++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_8BITS_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_array_##in_name##to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
\
Tensor img1 = create_tensor_from_image2d_array(input, 1); \
Tensor img2 = create_tensor_from_image2d_array(output, 1); \
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
}
CUMSUM_8BITS_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)
__kernel void cumsum_array_I16toI16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.z = 0; coord.z < channel; coord.z++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_F16toF16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_8BITS_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_array_##in_name##to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
    Tensor img1 = create_tensor_from_image2d_array(input, 1); \
    Tensor img2 = create_tensor_from_image2d_array(output, 1); \
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
\
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
}
CUMSUM_8BITS_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)
__kernel void cumsum_array_I16toI16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_F16toF16_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, tmpsum, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
for(; coord.x < width; coord.x += 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_ARRAY_QINT_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \
__kernel void cumsum_array_##in_name##to##out_name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
vxc_short8 rowSum; \
int4 sum0 = (int4)(0), sum1 = (int4)(0); \
    short zp = (short)input_zp; \
    Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \
    Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \
 \
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \
\
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1)
CUMSUM_ARRAY_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16, 1)
CUMSUM_ARRAY_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8, 2)

View File

@ -0,0 +1,259 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int input_zp;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
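/*
 * 2D (single image) variants of the array-based cumulative-sum kernels.
 * They mirror the image2d_array versions but address the data through
 * Image / create_image_from_image2d accessors, and use the same
 * w_size / remainder handling to keep the last 8- or 16-wide access inside
 * the row.
 */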
__kernel void cumsum_array_F16toF16_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(; coord.y < height; coord.y++)
{
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
        src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_8BITS_ARRAY_AXIS1_2D(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_array_##in_name##to##out_name##_axis1_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0); \
int4 sum1 = (int4)(0); \
int4 sum2 = (int4)(0); \
int4 sum3 = (int4)(0); \
\
Image img1 = create_image_from_image2d(input, 1); \
Image img2 = create_image_from_image2d(output, 1); \
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32D_4x4); \
\
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
\
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_8BITS_ARRAY_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_ARRAY_AXIS1_2D(I8, I8, vxc_char16, vxc_char16)
__kernel void cumsum_array_I16toI16_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_F16toF16_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 src, dst;
vxc_half8 data, tmpsum, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
for(; coord.x < width; coord.x += 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
#define CUMSUM_ARRAY_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type, stride_data) \
__kernel void cumsum_array_##in_name##to##out_name##_axis0_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
src_type src; \
dst_type dst; \
vxc_short8 rowSum; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0); \
short zp = (short)input_zp; \
Image img1 = create_image_from_image2d(input, stride_data); \
Image img2 = create_image_from_image2d(output, stride_data); \
\
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniSumHorzU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniSubZpI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzI16toI32B_4x4); \
\
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
\
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, 1)
CUMSUM_ARRAY_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, 1)
CUMSUM_ARRAY_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, 2)

View File

@ -0,0 +1,244 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int remainder;
_viv_uniform int w_size;
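/*
 * BF16 cumulative sum, array variant.  Each vxc_ushort8 of bfloat16 values is
 * widened to two float4 vectors by moving the 16-bit patterns into the upper
 * half of 32-bit words (uniConvBF16toF32_Part0/1), the accumulation is done
 * in float, and uniExtractOddData_2x8 takes the upper 16 bits of every float
 * back as the bf16 result (plain truncation, no rounding).
 */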
__kernel void cumsum_array_BF16toBF16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float4 sum0 = (float4)(0), sum1 = (float4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.z = 0; coord.z < channel; coord.z++)
{
float4 data0, data1;
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
sum0 += data0;
sum1 += data1;
_viv_asm(COPY, dst0, sum0, 16);
_viv_asm(COPY, dst1, sum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float4 sum0 = (float4)(0), sum1 = (float4)(0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data0, data1;
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
sum0 += data0;
sum1 += data1;
_viv_asm(COPY, dst0, sum0, 16);
_viv_asm(COPY, dst1, sum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float preSum = 0;
float4 one = (float4)(1.0, 1.0, 1.0, 1.0);
float4 q = (float4)(1.0, 1.0, 1.0, 0);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
for(; coord.x < width; coord.x += 8)
{
float4 data0, data1;
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));
float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));
tmpSum1 += tmpSum0.w;
tmpSum0 += preSum;
tmpSum1 += preSum;
preSum = tmpSum1.w;
_viv_asm(COPY, dst0, tmpSum0, 16);
_viv_asm(COPY, dst1, tmpSum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), 0);
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float4 sum0 = (float4)(0), sum1 = (float4)(0);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
for(; coord.y < height; coord.y++)
{
float4 data0, data1;
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
sum0 += data0;
sum1 += data1;
_viv_asm(COPY, dst0, sum0, 16);
_viv_asm(COPY, dst1, sum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}
__kernel void cumsum_array_BF16toBF16_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis, int exclusive, int rev
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_ushort8 src, val0, val1;
vxc_ushort8 dst0, dst1, dst;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
float preSum = 0;
float4 one = (float4)(1.0, 1.0, 1.0, 1.0);
float4 q = (float4)(1.0, 1.0, 1.0, 0);
Image img1 = create_image_from_image2d(input, 2);
Image img2 = create_image_from_image2d(output, 2);
for(; coord.x < width; coord.x += 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
float4 data0, data1;
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;
src = in_ptr[0];
VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, data0, val0, 16);
_viv_asm(COPY, data1, val1, 16);
float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));
float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));
tmpSum1 += tmpSum0.w;
tmpSum0 += preSum;
tmpSum1 += preSum;
preSum = tmpSum1.w;
_viv_asm(COPY, dst0, tmpSum0, 16);
_viv_asm(COPY, dst1, tmpSum1, 16);
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtractOddData_2x8);
out_ptr[0] = dst;
}
}

View File

@ -0,0 +1,259 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;
_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;
_viv_uniform int width;
_viv_uniform int input_zp;
_viv_uniform float in_out_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
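/*
 * Exclusive / reverse cumulative sum along axis 0 (width), array variant.
 *   rev       : accumulate from the end of the row towards the start;
 *   exclusive : the running sum excludes the current element, so the first
 *               output (the last one when rev is set) is just the zero /
 *               zero-point value and every result is shifted by one element.
 *
 * Host-side sketch of the exclusive + reverse case on raw values:
 *
 *     out[width - 1] = 0;
 *     for (int x = width - 1; x > 0; --x)
 *         out[x - 1] = out[x] + in[x];      // walk from the end toward x = 0
 *
 * followed by the usual requantization (raw_sum * in_out_scale + output_zp;
 * the input zero point is removed per element through the uniSubZp* tables).
 */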
__kernel void cumsum_ex_rev_array_F16toF16_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev
)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_short8 src, dst;
vxc_half8 data, tmpsum, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniSumHorzRevF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
for(; coord.x < width - 8;)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord_out.x = coord.x + 1;
coord.x += 8;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
coord.x = width - 8;
coord_out.x = width - 1;
_viv_asm(COPY, dst, sum, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst;
for(; coord.x > 0;)
{
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
coord_out.x = coord.x - 1;
coord.x -= 8;
_viv_asm(COPY, data, src, 16);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniSumHorzRevF16toF16C_2x8);
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
}
#define CUMSUM_QINT_EX_REV_ARRAY_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \
__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \
int4 coord_out = coord; \
\
src_type src; \
dst_type dst; \
vxc_short8 rowSum; \
int4 sum0 = (int4)(0), sum1 = (int4)(0); \
short zp = (short)input_zp; \
\
Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
            src = in_ptr[0]; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32B_4x4); \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
for(coord.x = -1; coord.x < width - 8;) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
coord_out.x = coord.x + 1; \
coord.x += 8; \
output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
out_ptr = (__global dst_type*)output_ptr; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzI16toI32B_4x4); \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
for(coord.x = width - 7; coord.x > 0;) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
coord_out.x = coord.x - 1; \
coord.x -= 8; \
VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32A_4x4); \
VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniAccSumHorzRevI16toI32B_4x4); \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_QINT_EX_REV_ARRAY_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1)
CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I8, I8, vxc_char16, vxc_char16, 1)
CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I16, I16, vxc_short8, vxc_short8, 2)

View File

@ -0,0 +1,330 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int height;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
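// Cumulative sum of F16 data along axis 1 (height) using the pointer-based "array" path.
// Only the reverse, exclusive, and exclusive+reverse modes are handled here; w_size and
// remainder realign the last 8-wide vector of each row so it stays inside the tensor.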
__kernel void cumsum_ex_rev_array_F16toF16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
dst ^= dst;
out_ptr[0] = dst;
for(; coord.y < height - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.y++;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
dst ^= dst;
coord.y = height - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst;
for(; coord.y > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.y--;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
}
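// 8-bit (U8/I8) exclusive/reverse cumsum along axis 1: 16 elements per work-item are
// accumulated into four int4 partial sums, rescaled with in_out_scale/in_out_zp_scale,
// and requantized to the output zero point.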
#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev) \
{ \
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
\
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 1); \
Tensor img2 = create_tensor_from_image2d_array(output, 1); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.y < height - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
coord.y++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
coord.y = height - 1; \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.y > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
coord.y--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)
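// I16 variant of the axis-1 exclusive/reverse cumsum: 8 shorts per work-item,
// accumulated in two int4 partial sums before requantization.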
__kernel void cumsum_ex_rev_array_I16toI16_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst.xxxxxxxx;
for(; coord.y < height - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.y++;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
coord.y = height - 1;
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst.xxxxxxxx;
for(; coord.y > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
coord.y--;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
}

View File

@ -0,0 +1,322 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int channel;
_viv_uniform float in_out_scale;
_viv_uniform float in_out_zp_scale;
_viv_uniform float output_zp;
_viv_uniform int remainder;
_viv_uniform int w_size;
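// Cumulative sum of F16 data along axis 2 (channel); only the reverse, exclusive,
// and exclusive+reverse modes are handled here.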
__kernel void cumsum_ex_rev_array_F16toF16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_short8 src, dst;
vxc_half8 data, sum;
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(rev && exclusive == 0)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
_viv_asm(COPY, data, src, 16);
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(rev == 0 && exclusive)
{
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
for(; coord.z < channel - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.z++;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
else if(rev && exclusive)
{
_viv_asm(COPY, dst, sum, 16);
coord.z = channel - 1;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst;
for(; coord.z > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.z--;
_viv_asm(COPY, data, src, 16);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
_viv_asm(COPY, dst, sum, 16);
out_ptr[0] = dst;
}
}
}
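// 8-bit (U8/I8) exclusive/reverse cumsum along axis 2: 16 elements per work-item,
// accumulated in four int4 registers and requantized on store.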
#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \
__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
\
src_type src; \
dst_type dst; \
int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
\
if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
{ \
coord.x = coord.x - (16 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 1); \
Tensor img2 = create_tensor_from_image2d_array(output, 1); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global src_type* in_ptr = (__global src_type*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(rev && exclusive == 0) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global src_type*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8);\
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
uniConvertInt32toUint8_2x8);\
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.z < channel - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
coord.z++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(rev && exclusive) \
{ \
coord.z = channel - 1; \
int tmpAlpha0 = convert_int_rte(output_zp); \
int4 tmpVal; \
tmpVal.x = tmpAlpha0; \
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
for(; coord.z > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global src_type*)input_ptr; \
src = in_ptr[0]; \
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
coord.z--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
int4 tmpDst0 = convert_int4_rte(tmpSum0); \
int4 tmpDst1 = convert_int4_rte(tmpSum1); \
int4 tmpDst2 = convert_int4_rte(tmpSum2); \
int4 tmpDst3 = convert_int4_rte(tmpSum3); \
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \
uniConvertInt32toUint8_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)
CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)
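// I16 variant of the axis-2 exclusive/reverse cumsum (8 shorts per work-item).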
__kernel void cumsum_ex_rev_array_I16toI16_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis, int exclusive, int rev)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_short8 src, dst;
int4 sum0 = (int4)(0), sum1 = (int4)(0);
if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
{
coord.x = coord.x - (8 - remainder);
}
Tensor img1 = create_tensor_from_image2d_array(input, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
__global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
if(exclusive == 0 && rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
in_ptr = (__global vxc_short8*)input_ptr;
out_ptr = (__global vxc_short8*)output_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev == 0)
{
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
out_ptr[0] = dst.xxxxxxxx;
for(; coord.z < channel - 1;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
coord.z++;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
else if(exclusive && rev)
{
coord.z = channel - 1;
int tmpAlpha0 = convert_int_rte(output_zp);
int4 tmpVal;
tmpVal.x = tmpAlpha0;
VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
out_ptr[0] = dst.xxxxxxxx;
for(; coord.z > 0;)
{
input_ptr = get_tensor_ptr_from_coord(img1, coord);
in_ptr = (__global vxc_short8*)input_ptr;
src = in_ptr[0];
VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
coord.z--;
output_ptr = get_tensor_ptr_from_coord(img2, coord);
out_ptr = (__global vxc_short8*)output_ptr;
float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
int4 tmpDst0 = convert_int4_rte(tmpSum0);
int4 tmpDst1 = convert_int4_rte(tmpSum1);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
uniConvertInt32toUint8_2x8);
out_ptr[0] = dst;
}
}
}

View File

@ -0,0 +1,324 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
_viv_uniform int remainder;
_viv_uniform int w_size;
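// The macros below implement the forward (non-exclusive) cumsum of F16 input along
// axis 2, 1, and 0 with requantized I8/I16/U8 output; multAndoutZP0 packs the
// multiplier and output zero point used by the post-shift conversion.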
#define CUMSUM_ARRAY_F16TOQINT_AXIS2(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_AXIS1(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_AXIS0(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, tmpsum, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16, 1)
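// Exclusive/reverse variants of the F16-to-quantized cumsum along axis 2 and axis 1.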
#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_ex_rev_F16to##out_name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
for(; coord.z < channel - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.z++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
coord.z = channel - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst; \
for(; coord.z > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.z--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_ex_rev_F16to##out_name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, int exclusive, int rev \
) \
{ \
int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
if(exclusive == 0 && rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev == 0) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
for(; coord.y < height - 1;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.y++; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
else if(exclusive && rev) \
{ \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
coord.y = height - 1; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
out_ptr[0] = dst; \
for(; coord.y > 0;) \
{ \
input_ptr = get_tensor_ptr_from_coord(img1, coord); \
in_ptr = (__global vxc_short8*)input_ptr; \
src = in_ptr[0]; \
coord.y--; \
output_ptr = get_tensor_ptr_from_coord(img2, coord); \
out_ptr = (__global dst_type*)output_ptr; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
} \
}
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16, 1)

View File

@ -0,0 +1,108 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int channel;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
_viv_uniform int remainder;
_viv_uniform int w_size;
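// 2D (single-slice) forward cumsum of F16 input along axis 1 and axis 0 with
// requantized I8/I16/U8 output.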
#define CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis1_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), 0); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
for(; coord.y < height; coord.y++) \
{ \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumVertF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16, 1)
#define CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type, stride_out) \
__kernel void cumsum_array_F16to##out_name##_axis0_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, int exclusive, int rev \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
vxc_short8 src; \
dst_type dst; \
vxc_half8 data, tmpsum, sum; \
VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
Tensor img1 = create_tensor_from_image2d_array(input, 2); \
Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
for(; coord.x < width; coord.x += 8) \
{ \
if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
{ \
coord.x = coord.x - (8 - remainder); \
} \
uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
__global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
__global dst_type* out_ptr = (__global dst_type*)output_ptr; \
src = in_ptr[0]; \
_viv_asm(COPY, data, src, 16); \
\
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16A_4x4); \
VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16B_4x4); \
VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniSumHorzF16toF16C_2x8); \
VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniAccSumHorzF16toF16_2x8); \
VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift_0_Lo_2x8); \
out_ptr[0] = dst; \
} \
}
CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16, 1)
CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8, 2)
CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16, 1)

View File

@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_1D(
VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
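// Pointer-based gather_nd for 1-D indices: each work-item reads one index from input1
// and copies the selected element of input0 into its output position.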
__kernel void gather_nd_array_I8toI8_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_U8toU8_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_I16toI16_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_F16toF16_1D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
coord.w = indice.x;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_2D(
VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_array_I8toI8_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_U8toU8_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_I16toI16_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_F16toF16_2D(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
Image img1 = create_image_from_image2d(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -80,3 +80,85 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \
GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16)
GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16)
GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8)
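// Pointer-based gather_nd with type conversion (2-D indices): the first macro dequantizes
// the gathered values to F16 via multAndoutZP0, the second requantizes F16 input via
// multAndoutZP1.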
#define GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride) \
__kernel void gather_nd_array_##src0_type_name##toF16_2D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
\
Image img1 = create_image_from_image2d(input0, stride); \
Image img2 = create_image_from_image2d(output, 2); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global ptr_type data_ptr = (__global ptr_type)input_ptr; \
__global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
read_type src = data_ptr[0]; \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
dst_ptr[0] = dst0; \
}
GATHER_ND_ARRAY_QINT_TO_F16_2D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_2D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_2D(I16, vxc_short8, vxc_short8*, 2)
#define GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride) \
__kernel void gather_nd_array_F16to##src1_type_name##_2D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
\
Image img1 = create_image_from_image2d(input0, 2); \
Image img2 = create_image_from_image2d(output, stride); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
__global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
vxc_short8 src = data_ptr[0]; \
\
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \
dst_ptr[0] = dst; \
}
GATHER_ND_ARRAY_F16_TO_QINT_2D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_2D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_2D(I16, vxc_short8, vxc_short8*, 2)

View File

@ -98,3 +98,120 @@ __kernel void gather_nd_F16toF16_3D(
VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
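// Pointer-based gather_nd for 3-D input: the gathered coordinate comes from input1
// (x scaled by block_size plus the block offset) and one element of input0 is copied
// per work-item.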
__kernel void gather_nd_array_I8toI8_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_U8toU8_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Image img2 = create_image_from_image2d(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_I16toI16_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_F16toF16_3D(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int4 coord = (int4)(0, gidy, gidx, 0);
Image img = create_image_from_image2d(input1, 4);
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.w = 0;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Image img2 = create_image_from_image2d(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -80,3 +80,86 @@ GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16)
GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16)
GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8)
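// Pointer-based gather_nd with type conversion for 3-D input: dequantizing (QINT to F16)
// and requantizing (F16 to QINT) variants follow.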
#define GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride) \
__kernel void gather_nd_array_##src0_type_name##toF16_3D( \
__read_only image2d_array_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
indice.w = 0; \
Tensor img1 = create_tensor_from_image2d_array(input0, stride); \
Image img2 = create_image_from_image2d(output, 2); \
\
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global ptr_type data_ptr = (__global ptr_type)input_ptr; \
__global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
read_type src = data_ptr[0]; \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
dst_ptr[0] = dst0; \
}
GATHER_ND_ARRAY_QINT_TO_F16_3D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_3D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_3D(I16, vxc_short8, vxc_short8*, 2)
#define GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride) \
__kernel void gather_nd_array_F16to##src1_type_name##_3D( \
__read_only image2d_array_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
indice.x = indice.x * block_size + gidx; \
indice.w = 0; \
\
Tensor img1 = create_tensor_from_image2d_array(input0, 2); \
Image img2 = create_image_from_image2d(output, stride); \
\
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
__global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
vxc_short8 src = data_ptr[0]; \
\
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \
dst_ptr[0] = dst; \
}
GATHER_ND_ARRAY_F16_TO_QINT_3D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_3D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_3D(I16, vxc_short8, vxc_short8*, 2)

View File

@ -95,3 +95,118 @@ __kernel void gather_nd_batch_F16toF16_1D(
VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_array_batch_I8toI8_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
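// Batched 1D variant: gidz selects the batch, the index row at (0, gidy, gidz)
// picks the source column, and gidx walks the block being copied.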
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_U8toU8_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_I16toI16_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_F16toF16_1D(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
Image img1 = create_image_from_image2d(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -26,7 +26,7 @@ __kernel void gather_nd_batch_I8toI8_2D(
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_U8toU8_2D(
__kernel void gather_nd_batch_U8toU8_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -51,7 +51,7 @@ __kernel void gather_nd_U8toU8_2D(
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_I16toI16_2D(
__kernel void gather_nd_batch_I16toI16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -76,7 +76,7 @@ __kernel void gather_nd_I16toI16_2D(
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_F16toF16_2D(
__kernel void gather_nd_batch_F16toF16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -100,3 +100,123 @@ __kernel void gather_nd_F16toF16_2D(
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_array_batch_I8toI8_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
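// Batched 2D variant: x/y of the gather coordinate come from the index tensor
// (x scaled by block_size and offset by gidx) while z/w reuse the work-item's
// batch coordinate.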
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global char* data_ptr = (__global char*)input_ptr;
__global char* dst_ptr = (__global char*)output_ptr;
char src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_U8toU8_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 1);
Tensor img2 = create_tensor_from_image2d_array(output, 1);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global uchar* data_ptr = (__global uchar*)input_ptr;
__global uchar* dst_ptr = (__global uchar*)output_ptr;
uchar src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_I16toI16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}
__kernel void gather_nd_array_batch_F16toF16_2D(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // index num
int gidz = get_global_id(2); // batch num
int4 coord = (int4)(gidx, gidy, gidz, 0);
Tensor img = create_tensor_from_image2d_array(input1, 4);
uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
indice.zw = coord.zw;
Tensor img1 = create_tensor_from_image2d_array(input0, 2);
Tensor img2 = create_tensor_from_image2d_array(output, 2);
uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);
uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
__global short* data_ptr = (__global short*)input_ptr;
__global short* dst_ptr = (__global short*)output_ptr;
short src = data_ptr[0];
dst_ptr[0] = src;
}

View File

@ -81,3 +81,85 @@ GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16)
GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16)
GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)
#define GATHER_ND_ARRAY_QINT_TO_F16_1D(src0_type_name, read_type, ptr_type, stride) \
__kernel void gather_nd_array_##src0_type_name##toF16_1D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
coord.w = indice.x; \
\
Image img1 = create_image_from_image2d(input0, stride); \
Image img2 = create_image_from_image2d(output, 2); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global ptr_type data_ptr = (__global ptr_type)input_ptr; \
__global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
read_type src = data_ptr[0]; \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
dst_ptr[0] = dst0; \
}
GATHER_ND_ARRAY_QINT_TO_F16_1D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_1D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_QINT_TO_F16_1D(I16, vxc_short8, vxc_short8*, 2)
#define GATHER_ND_ARRAY_F16_TO_QINT_1D(src1_type_name, write_type, ptr_type, stride) \
__kernel void gather_nd_array_F16to##src1_type_name##_1D( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
int4 coord = (int4)(0, gidy, gidx, 0); \
Image img = create_image_from_image2d(input1, 4); \
uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
int4 indice = ((int4 *)indice_ptr)[0]; \
\
coord.w = indice.x; \
\
Image img1 = create_image_from_image2d(input0, 2); \
Image img2 = create_image_from_image2d(output, stride); \
\
uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
\
__global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
__global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
vxc_short8 src = data_ptr[0]; \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
dst_ptr[0] = dst; \
}
GATHER_ND_ARRAY_F16_TO_QINT_1D(U8, vxc_uchar16, vxc_uchar16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_1D(I8, vxc_char16, vxc_char16*, 1)
GATHER_ND_ARRAY_F16_TO_QINT_1D(I16, vxc_short8, vxc_short8*, 2)

View File

@ -65,5 +65,5 @@ __kernel void pre_process_gray_half_U8toU8
coord_in.xy = coord_in.xy >> 1;
VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

File diff suppressed because it is too large Load Diff

View File

@ -62,11 +62,20 @@ static vsi_status _argmaxmin_op_compute
}
status = VSI_FAILURE;
param =vsi_nn_kernel_param_create();
param = vsi_nn_kernel_param_create();
if (strcmp(kernel_name, "argmax") == 0)
{
vsi_nn_argmax_param * p = &(self->nn_param.argmax);
axis = p->axis;
#if (VX_ARGMAX_VX_SUPPORT)
vsi_nn_kernel_param_add_int32(param, "axis", axis);
self->n = (vx_node)vsi_nn_kernel_selector(self->graph,
kernel_name,
inputs, 1,
outputs, 1, param);
goto final;
#endif
}
else
{
@ -101,6 +110,10 @@ static vsi_status _argmaxmin_op_compute
vsi_nn_ReleaseTensor( &reshape_tensors[0] );
vsi_nn_ReleaseTensor( &reshape_tensors[1] );
}
#if (VX_ARGMAX_VX_SUPPORT)
final:
#endif
if( self->n )
{
status = VSI_SUCCESS;

View File

@ -0,0 +1,153 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"
typedef struct _bitcast_local_data_t {
int32_t placeholder;
} bitcast_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_t n = NULL;
n = vsi_nn_kernel_selector( self->graph, "bitcast", inputs, 1, outputs, 1, NULL );
if (n != NULL)
{
status = VSI_SUCCESS;
}
self->n = (vx_node)n;
return status;
} /* op_compute() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
int32_t i = 0;
VSI_UNREFERENCED(self);
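/* Output shape follows the element-size ratio: equal element sizes keep the
 * input shape, a wider input type prepends a dimension holding
 * input_byte / output_byte elements, and a narrower input type drops one
 * dimension (a rank-0 result is marked as a scalar tensor). */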
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
uint32_t input_byte = 0;
uint32_t output_byte = 0;
uint32_t in_dim = inputs[0]->attr.dim_num;
input_byte = vsi_nn_TypeGetBytesExt(inputs[0]->attr.dtype.vx_type);
output_byte = vsi_nn_TypeGetBytesExt(outputs[0]->attr.dtype.vx_type);
if (input_byte == output_byte)
{
outputs[0]->attr.dim_num = in_dim;
for (i = 0; i < (int32_t)(in_dim); i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
else if (input_byte > output_byte)
{
outputs[0]->attr.dim_num = in_dim + 1;
outputs[0]->attr.size[0] = input_byte / output_byte;
for (i = 1; i < (int32_t)(outputs[0]->attr.dim_num); i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i - 1];
}
}
else
{
if ((uint32_t)(inputs[0]->attr.size[in_dim - 1]) != output_byte / input_byte)
{
VSILOGE("If input datatype is smaller than output datatype, bitcast op requires that \
the rightmost dimension be equal to sizeof(output datatype) / sizeof(input datatype)");
return FALSE;
}
outputs[0]->attr.dim_num = in_dim - 1;
if (outputs[0]->attr.dim_num == 0)
{
outputs[0]->attr.size[0] = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{
for (i = 0; i < (int32_t)(outputs[0]->attr.dim_num); i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1];
}
}
}
}
return TRUE;
} /* op_setup() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ BITCAST,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ NULL,
/* check */ NULL,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -0,0 +1,258 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
typedef struct _col2im_local_data_t {
int32_t placeholder;
} col2im_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t* param = NULL;
param = vsi_nn_kernel_param_create();
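/* Forward the col2im attributes to the kernel: strides, pads and dilations are
 * passed as scalar parameters, block_shape as a buffer of dim_num entries. */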
vsi_nn_kernel_param_add_int32( param, "stride_w", self->nn_param.col2im.strides[0] );
vsi_nn_kernel_param_add_int32( param, "stride_h", self->nn_param.col2im.strides[1] );
vsi_nn_kernel_param_add_int32( param, "stride_d", self->nn_param.col2im.strides[2] );
vsi_nn_kernel_param_add_int32( param, "pad_w_front", self->nn_param.col2im.pads[0] );
vsi_nn_kernel_param_add_int32( param, "pad_w_end", self->nn_param.col2im.pads[1] );
vsi_nn_kernel_param_add_int32( param, "pad_h_front", self->nn_param.col2im.pads[2] );
vsi_nn_kernel_param_add_int32( param, "pad_h_end", self->nn_param.col2im.pads[3] );
vsi_nn_kernel_param_add_int32( param, "pad_d_front", self->nn_param.col2im.pads[4] );
vsi_nn_kernel_param_add_int32( param, "pad_d_end", self->nn_param.col2im.pads[5] );
vsi_nn_kernel_param_add_int32( param, "dilation_w", self->nn_param.col2im.dilations[0] );
vsi_nn_kernel_param_add_int32( param, "dilation_h", self->nn_param.col2im.dilations[1] );
vsi_nn_kernel_param_add_int32( param, "dilation_d", self->nn_param.col2im.dilations[2] );
vsi_nn_kernel_param_add_buffer( param, "block_shape", (void*)self->nn_param.col2im.block_shape, \
self->nn_param.col2im.dim_num );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "col2im",
inputs, 1, outputs, 1, param );
if (self->n)
{
status = VSI_SUCCESS;
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(COL2IM, 1, 1)
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F32, D_I32)
IO_TYPE(D_F32, D_U32)
IO_TYPE(D_F32, D_F16)
IO_TYPE(D_I32, D_F32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_U32)
IO_TYPE(D_I32, D_F16)
IO_TYPE(D_U32, D_F32)
IO_TYPE(D_U32, D_I32)
IO_TYPE(D_U32, D_U32)
IO_TYPE(D_F16, D_I16|Q_DFP)
IO_TYPE(D_F16, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_I16|Q_SYM)
IO_TYPE(D_F16, D_I8|Q_DFP)
IO_TYPE(D_F16, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_I8|Q_SYM)
IO_TYPE(D_F16, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F16)
IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_ASYM, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F16)
IO_TYPE(D_I16|Q_SYM, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I16, D_F16)
IO_TYPE(D_I16, D_I8|Q_DFP)
IO_TYPE(D_I16, D_U8|Q_ASYM)
IO_TYPE(D_I16, D_I32)
IO_TYPE(D_I16, D_U32)
IO_TYPE(D_I16, D_F32)
IO_TYPE(D_I8|Q_DFP, D_F16)
IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_I16|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F16)
IO_TYPE(D_I8|Q_SYM, D_I16|Q_DFP)
IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I8, D_F16)
IO_TYPE(D_I8, D_I16|Q_DFP)
IO_TYPE(D_I8, D_U8|Q_ASYM)
IO_TYPE(D_I8, D_I32)
IO_TYPE(D_I8, D_U32)
IO_TYPE(D_I8, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_U8, D_F16)
IO_TYPE(D_U8, D_I16|Q_DFP)
IO_TYPE(D_U8, D_I8|Q_DFP)
IO_TYPE(D_U8, D_I32)
IO_TYPE(D_U8, D_U32)
IO_TYPE(D_U8, D_F32)
IO_TYPE(D_F32, D_I16|Q_DFP)
IO_TYPE(D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F32, D_I16|Q_SYM)
IO_TYPE(D_F32, D_I8|Q_DFP)
IO_TYPE(D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F32, D_I8|Q_SYM)
IO_TYPE(D_F32, D_U8|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_DFP)
IO_TYPE(D_I32, D_I16|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_SYM)
IO_TYPE(D_I32, D_I8|Q_DFP)
IO_TYPE(D_I32, D_I8|Q_ASYM)
IO_TYPE(D_I32, D_I8|Q_SYM)
IO_TYPE(D_I32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32)
IO_TYPE(D_F16, D_I32)
IO_TYPE(D_F16, D_I16)
IO_TYPE(D_F16, D_U8)
IO_TYPE(D_F16, D_I8)
IO_TYPE(D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_I32)
IO_TYPE(D_BF16, D_BF16)
END_IO_TYPE_DECL(COL2IM)
if (!VALIDATE_OP_IO_TYPES(COL2IM, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_col2im_param *p = NULL;
p = (vsi_nn_col2im_param* )&(self->nn_param.col2im);
int32_t i = 0;
vsi_size_t block_size = 1;
vsi_size_t channel = 1;
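/* With the output shape on AUTO, rebuild it from the col2im attributes:
 * image_shape provides the spatial sizes and the product of block_shape
 * determines how many input columns fold into one output channel. */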
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
outputs[0]->attr.dim_num = p->dim_num + 2;
for (i = 0; i < p->dim_num; i++)
{
outputs[0]->attr.size[i] = (vsi_size_t)p->image_shape[i];
block_size = block_size * (vsi_size_t)p->block_shape[i];
}
channel = inputs[0]->attr.size[1] / block_size;
outputs[0]->attr.size[i + 1] = channel;
outputs[0]->attr.size[i + 2] = inputs[0]->attr.size[0];
}
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.col2im.pads[0] = 0;
self->nn_param.col2im.pads[1] = 0;
self->nn_param.col2im.pads[2] = 0;
self->nn_param.col2im.pads[3] = 0;
self->nn_param.col2im.pads[4] = 0;
self->nn_param.col2im.pads[5] = 0;
self->nn_param.col2im.strides[0] = 1;
self->nn_param.col2im.strides[1] = 1;
self->nn_param.col2im.strides[2] = 1;
self->nn_param.col2im.dilations[0] = 1;
self->nn_param.col2im.dilations[1] = 1;
self->nn_param.col2im.dilations[2] = 1;
return VSI_SUCCESS;
}
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ COL2IM,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -28,6 +28,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
@ -278,7 +279,7 @@ static vsi_status op_compute
if(_is_tensorview_support(self, outputs)
&& _is_same_quant(self, inputs, outputs)
&& (_has_norm_input(self, inputs) == FALSE)
&& self->graph->ctx->options.enable_concat_optimize)
&& ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize)
{
iter = self->nn_param.concat.lcl_data;
while( NULL != iter )
@ -443,7 +444,7 @@ static vsi_status op_optimize
if (_is_tensorview_support(self, outputs) == FALSE ||
_is_same_quant(self, inputs, outputs) == FALSE ||
_has_norm_input(self, inputs) == TRUE ||
self->graph->ctx->options.enable_concat_optimize == 0)
((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize == 0)
{
return status;
}

View File

@ -23,6 +23,7 @@
*****************************************************************************/
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_graph.h"
@ -95,7 +96,7 @@ static vsi_status op_optimize
status = VSI_SUCCESS;
if( !self->graph->ctx->options.enable_dataconvert_optimize )
if( !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_dataconvert_optimize )
{
return status;
}
@ -266,14 +267,14 @@ static vsi_bool op_check
IO_TYPE(D_BF16, D_BF16)
IO_TYPE(D_BF16, D_F16)
IO_TYPE(D_BF16, D_F32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_F32)
IO_TYPE(D_I32, D_F16)
IO_TYPE(D_I32, D_I16|Q_DFP)
IO_TYPE(D_I32, D_I8|Q_DFP)
IO_TYPE(D_I32, D_U32)
IO_TYPE(D_I32, D_U16)
IO_TYPE(D_I32, D_U8|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_F32)
IO_TYPE(D_I32|Q_ASYM, D_F16)
IO_TYPE(D_I32|Q_ASYM, D_I16|Q_DFP)
IO_TYPE(D_I32|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_I32|Q_ASYM, D_U32|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_U16|Q_ASYM)
IO_TYPE(D_I32|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U32, D_U32)
IO_TYPE(D_U32, D_I16|Q_DFP)
IO_TYPE(D_U32, D_I8|Q_DFP)
@ -281,7 +282,7 @@ static vsi_bool op_check
IO_TYPE(D_U32, D_U8|Q_ASYM)
IO_TYPE(D_U32, D_U8)
IO_TYPE(D_BF16, D_I32)
IO_TYPE(D_I32, D_BF16)
IO_TYPE(D_I32|Q_ASYM, D_BF16)
IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U4|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM)

View File

@ -183,10 +183,16 @@ vsi_bool vsi_nn_op_eltwise_setup
shape[i] = sz0;
}
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = out_rank;
memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
if (out_rank == 1 &&
vsi_nn_GetTensorIsScalar(inputs[0]) &&
vsi_nn_GetTensorIsScalar(inputs[1]))
{
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
}
else
{

View File

@ -54,10 +54,12 @@ static vsi_status op_compute
vsi_nn_kernel_param_t* param = NULL;
int32_t align_corners = self->nn_param.gridsample.align_corners;
int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode;
int32_t mode = (int32_t)self->nn_param.gridsample.mode;
vsi_nn_kernel_node_t n;
char kernel_name[128];
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32(param, "mode", mode);
vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode);

View File

@ -0,0 +1,412 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "utils/vsi_nn_math.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
/*
Declare number of input and output.
*/
#define _ARG_NUM (1)
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
#define LOCAL() ((vsi_nn_grouped_conv3d_param_local_data *)nn_param->local)
typedef struct _vsi_nn_grouped_conv3d_param_local_data {
vsi_nn_tensor_t ** input_tensor_group;
vsi_nn_tensor_t ** weight_tensor_group;
vsi_nn_tensor_t ** bias_tensor_group;
vsi_nn_tensor_t ** output_tensor_group;
} vsi_nn_grouped_conv3d_param_local_data;
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
#if VX_CONV_3D_API_SUPPORT
#define _TENSOR_LEN 64
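/* Grouped 3D convolution is lowered to nn_param->group independent
 * vxConv3dLayer nodes: input, weight, bias and output are split into
 * per-group views and convolved pairwise. */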
vsi_bool res;
uint32_t i;
char tensor_name[_TENSOR_LEN];
vsi_nn_grouped_conv3d_param *nn_param = &self->nn_param.grouped_conv3d;
nn_param->local = (vsi_nn_grouped_conv3d_param_local_data*)malloc(
sizeof(vsi_nn_grouped_conv3d_param_local_data));
if (NULL == nn_param->local)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(nn_param->local, 0, sizeof(vsi_nn_grouped_conv3d_param_local_data));
LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->input_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->input_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 3,
LOCAL()->input_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->weight_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->weight_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 4,
LOCAL()->weight_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->bias_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->bias_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
if (inputs[2] != NULL)
{
res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0,
LOCAL()->bias_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
}
LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc(
nn_param->group * sizeof(vsi_nn_tensor_t *));
if (NULL == LOCAL()->output_tensor_group)
{
VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
memset(LOCAL()->output_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 3,
LOCAL()->output_tensor_group, nn_param->group);
if (res == FALSE)
{
VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
for (i = 0; i < nn_param->group; i++)
{
vx_tensor bias;
vx_nn_convolution_3d_params_t *param = NULL;
vx_nn_convolution_3d_params_t param_;
memset( &param_, 0, sizeof( vx_nn_convolution_3d_params_t ) );
param = &param_;
param->padding_w_left = self->nn_param.grouped_conv3d.pad[0];
param->padding_w_right = self->nn_param.grouped_conv3d.pad[1];
param->padding_h_top = self->nn_param.grouped_conv3d.pad[2];
param->padding_h_bottom = self->nn_param.grouped_conv3d.pad[3];
param->padding_d_front = self->nn_param.grouped_conv3d.pad[4];
param->padding_d_rear = self->nn_param.grouped_conv3d.pad[5];
param->stride_w = self->nn_param.grouped_conv3d.stride[0];
param->stride_h = self->nn_param.grouped_conv3d.stride[1];
param->stride_d = self->nn_param.grouped_conv3d.stride[2];
if (self->nn_param.grouped_conv3d.dilation[0] *
self->nn_param.grouped_conv3d.dilation[1] *
self->nn_param.grouped_conv3d.dilation[2] > 1)
{
VSILOGE("conv3d could not support dilation > 1\n");
return VSI_FAILURE;
}
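/* The vx_nn_convolution_3d_params_t dilation fields appear to be zero-based
 * (0 meaning no dilation), hence the "- 1" conversion below. */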
if ( self->nn_param.grouped_conv3d.dilation[0] > 0 )
{
param->dilation_w = self->nn_param.grouped_conv3d.dilation[0] - 1;
}
if ( self->nn_param.grouped_conv3d.dilation[1] > 0 )
{
param->dilation_h = self->nn_param.grouped_conv3d.dilation[1] - 1;
}
if ( self->nn_param.grouped_conv3d.dilation[2] > 0 )
{
param->dilation_d = self->nn_param.grouped_conv3d.dilation[2] - 1;
}
param->pad_mode = vsi_nn_get_vx_pad_mode(nn_param->pad_mode);
param->depth_multiplier = self->nn_param.grouped_conv3d.multiplier;
param->overflow_policy = self->vx_param.overflow_policy;
param->rounding_policy = self->vx_param.rounding_policy;
param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding;
if ( inputs[2] == NULL )
{
bias = NULL;
}
else
{
bias = LOCAL()->bias_tensor_group[i]->t;
}
self->n = vxConv3dLayer(
self->graph->g,
LOCAL()->input_tensor_group[i]->t,
LOCAL()->weight_tensor_group[i]->t,
bias,
(vx_nn_convolution_3d_params_t* )param,
sizeof( vx_nn_convolution_3d_params_t),
LOCAL()->output_tensor_group[i]->t
);
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, i);
if (vxSetReferenceName((vx_reference)LOCAL()->output_tensor_group[i]->t, tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u copy node output name fail", self->uid);
return VSI_FAILURE;
}
if ( NULL == self->n )
{
VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
else
{
// no need to maintain self->n
vxReleaseNode( &self->n );
self->n = NULL;
}
}
#else
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
#endif
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = FALSE;
ret = vsi_nn_OpCheck(VSI_NN_OP_CONV3D, self, inputs, outputs);
return ret;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/* TODO: Add code to compute outputs' shape. */
vsi_nn_grouped_conv3d_param *nn_param;
vsi_size_t perm[] = { 3, 2, 0, 1 };
#ifdef VX_CONVERT_POLICY_WRAP_ENABLE
if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 )
{
self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
}
#endif
if ( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt &&
VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type )
{
vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL );
inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
}
nn_param = &self->nn_param.grouped_conv3d;
{
vsi_size_t i, pad[_cnt_of_array(nn_param->pad)] = {0};
for (i = 0; i < _cnt_of_array(nn_param->pad); i++)
{
pad[i] = self->nn_param.grouped_conv3d.pad[i];
}
vsi_nn_compute_padding_3d(
inputs[0]->attr.size,
inputs[1]->attr.size,
nn_param->stride,
nn_param->dilation,
nn_param->pad_type,
pad
);
for (i = 0; i < _cnt_of_array(nn_param->pad); i++)
{
self->nn_param.grouped_conv3d.pad[i] = (uint32_t)pad[i];
}
}
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize
(
inputs[0]->attr.size[0],
inputs[1]->attr.size[0],
&nn_param->pad[0],
nn_param->stride[0],
nn_param->dilation[0],
VSI_NN_ROUND_FLOOR
);
outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize
(
inputs[0]->attr.size[1],
inputs[1]->attr.size[1],
&nn_param->pad[2],
nn_param->stride[1],
nn_param->dilation[1],
VSI_NN_ROUND_FLOOR
);
outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize
(
inputs[0]->attr.size[2],
inputs[1]->attr.size[2],
&nn_param->pad[4],
nn_param->stride[2],
nn_param->dilation[2],
VSI_NN_ROUND_FLOOR
);
if (self->nn_param.grouped_conv3d.weights > 0)
{
outputs[0]->attr.size[3] = self->nn_param.grouped_conv3d.weights;
}
else if (self->nn_param.grouped_conv3d.multiplier > 0)
{
outputs[0]->attr.size[3] = inputs[0]->attr.size[3] * self->nn_param.grouped_conv3d.multiplier;
}
else
{
outputs[0]->attr.size[3] = inputs[1]->attr.size[4];
}
outputs[0]->attr.size[4] = inputs[0]->attr.size[4];
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_nn_grouped_conv3d_param *nn_param = &(self->nn_param.grouped_conv3d);
uint32_t i;
if (LOCAL())
{
if (LOCAL()->input_tensor_group)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->input_tensor_group[i]));
}
free(LOCAL()->input_tensor_group);
}
if (LOCAL()->weight_tensor_group)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->weight_tensor_group[i]));
}
free(LOCAL()->weight_tensor_group);
}
if (LOCAL()->bias_tensor_group != NULL)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->bias_tensor_group[i]));
}
free(LOCAL()->bias_tensor_group);
}
if (LOCAL()->output_tensor_group != NULL)
{
for (i = 0; i < nn_param->group; i++)
{
vsi_nn_ReleaseTensor(&(LOCAL()->output_tensor_group[i]));
}
free(LOCAL()->output_tensor_group);
}
free(LOCAL());
}
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ GROUPED_CONV3D,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -0,0 +1,206 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_tensor_util_prv.h"
typedef struct _l1_layer_norm_local_data_t {
int32_t placeholder;
} l1_layer_norm_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (4)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
float eps = self->nn_param.l1_layer_norm.eps;
int32_t axis = self->nn_param.l1_layer_norm.axis;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_float32( param, "eps", eps );
vsi_nn_kernel_param_add_int32( param, "axis", axis );
n = vsi_nn_kernel_selector( self->graph, "l1_layer_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
if ( n != NULL )
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
if (param != NULL)
{
vsi_nn_kernel_param_release( &param );
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num);
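/* When the stream processor can handle these dtypes, skip the static
 * IO-type table check below. */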
if (!ret)
{
BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F16)
END_IO_TYPE_DECL(L1_LAYER_NORM)
if (!VALIDATE_OP_IO_TYPES(L1_LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num))
{
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
int32_t i = 0;
VSI_UNREFERENCED(self);
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
self->nn_param.l1_layer_norm.axis = 0;
return status;
} /* op_init() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ L1_LAYER_NORM,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ NULL,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
@ -161,7 +162,7 @@ static vsi_bool op_setup
if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP)
{
enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc;
enable_rgb88_planar_nhwc = ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_rgb88_planar_nhwc;
}
}

View File

@ -183,7 +183,8 @@ static vsi_bool _check_is_sp_supported_type
return FALSE;
}
if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) ||
if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2 ||
(axes[0] == 1 && (input->attr.size[0] == 1 || input->attr.size[2] == 1)))) ||
(axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) )
{
return TRUE;
@ -1167,6 +1168,7 @@ static vsi_bool op_setup
{
outputs[0]->attr.dim_num = 1;
outputs[0]->attr.size[0] = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{

View File

@ -93,52 +93,32 @@ static vsi_bool op_check
if (!ret)
{
BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_BF16, D_F32, D_BF16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_F32, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_BF16, D_F32, D_BF16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
END_IO_TYPE_DECL(RMS_NORM)
if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num))
{

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
@ -776,7 +777,7 @@ static vsi_status op_optimize
/* Only forward run stride_slice's optimize */
if ( direction == VSI_NN_OPTIMIZE_BACKWARD ||
!self->graph->ctx->options.enable_slice_optimize )
!((vsi_nn_graph_prv_t*)(self->graph))->options->enable_slice_optimize )
{
return status;
}

View File

@ -78,9 +78,10 @@ static vsi_status _tile_op_compute
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
uint32_t i = 0;
vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples;
int32_t* multiples_ = (int32_t*)self->nn_param.tile.multiples;
vsi_nn_tensor_t* temp_tensors[3] = { NULL };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = {1};
int32_t multiples_value[VSI_NN_MAX_DIM_NUM] = {0};
vsi_nn_tensor_attr_t attr;
@ -101,6 +102,11 @@ static vsi_status _tile_op_compute
temp_tensors[2] = outputs[0];
}
for (i = 0; i < inputs[0]->attr.dim_num; i ++)
{
multiples[i] = (vsi_size_t)multiples_[i];
}
ret = vsi_nn_kernel_optimize_tile_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
multiples, inputs[0]->attr.dim_num,
@ -111,6 +117,7 @@ static vsi_status _tile_op_compute
{
if (_is_supported_axis(shapes[1], new_rank) == FALSE)
{
uint32_t _multiples = (uint32_t)(new_rank > 4 && shapes[1][4] > 1 ? 3 : 2);
reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\
shapes[0], (vsi_size_t)new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, temp_tensors[2],\
@ -125,8 +132,11 @@ static vsi_status _tile_op_compute
memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr));
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = reshape_tensors[2]->attr.size[0];
attr.size[1] = reshape_tensors[2]->attr.size[1];
for (i = 0; i < _multiples; i++)
{
attr.size[i] = reshape_tensors[2]->attr.size[i];
}
temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr );
memset( &attr, 0 , sizeof(vsi_nn_tensor_attr_t) );
@ -136,9 +146,11 @@ static vsi_status _tile_op_compute
attr.size[0] = new_rank;
attr.dim_num = 1;
multiples_value[0] = (int32_t)shapes[1][0];
multiples_value[1] = (int32_t)shapes[1][1];
for (i = 0; i < new_rank; i++)
for (i = 0; i < _multiples; i++)
{
multiples_value[i] = (int32_t)shapes[1][i];
}
for (i = _multiples; i < new_rank; i++)
{
multiples_value[i] = 1;
}
@ -150,9 +162,11 @@ static vsi_status _tile_op_compute
goto final;
}
multiples_value[0] = 1;
multiples_value[1] = 1;
for (i = 0; i < new_rank; i++)
for (i = 0; i < _multiples; i++)
{
multiples_value[i] = 1;
}
for (i = _multiples; i < new_rank; i++)
{
multiples_value[i] = (int32_t)shapes[1][i];
}
@ -257,6 +271,7 @@ static vsi_bool op_check
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_U8|Q_ASYM)
IO_TYPE(D_BOOL8, D_BOOL8)
END_IO_TYPE_DECL(TILE)
if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,

View File

@ -471,6 +471,10 @@ static _op_param_gen_t s_op_gen[] =
/* TAN */ NULL,
/* RMSNORM */ NULL,
/* SHAPE */ NULL,
/* BITCAST */ NULL,
/* GROUPED_CONV3D */ NULL,
/* COL2IM */ NULL,
/* L1_LAYER_NORM */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );

View File

@ -772,6 +772,7 @@ vsi_bool vsi_nn_CreateTensorGroup
end[1] = in_tensor->attr.size[1];
end[2] = in_tensor->attr.size[2];
end[3] = in_tensor->attr.size[3];
end[4] = in_tensor->attr.size[4];
end[axis] = 0;
for( i = 0; i < group_number; i ++ )
{
@ -1259,6 +1260,32 @@ vsi_bool vsi_nn_is_same_quant_type(
}
break;
}
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: {
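/* Per-group symmetric quantization only matches when both tensors agree on
 * group count, group size and every per-group scale (compared with a 1e-5
 * tolerance). */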
const float diff = (float)1e-5;
int32_t i = 0;
int32_t scale_cnt0 = src_dtype->group_count;
int32_t scale_cnt1 = dst_dtype->group_count;
int32_t group_size0 = src_dtype->group_size;
int32_t group_size1 = dst_dtype->group_size;
if (scale_cnt0 == scale_cnt1 && group_size0 == group_size1)
{
const float* src_scale_ptr = src_dtype->group_scales;
const float* dst_scale_ptr = dst_dtype->group_scales;
for (i = 0; i < scale_cnt0; i++)
{
if (vsi_nn_float_compare(
src_scale_ptr[i], dst_scale_ptr[i], diff) == FALSE)
{
return FALSE;
}
}
} else {
return FALSE;
}
break;
}
#endif
default:
break;
}

View File

@ -22,10 +22,10 @@
*
*****************************************************************************/
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_test.h"
#include "vsi_nn_context.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_types.h"
static vsi_status query_hardware_caps
(
@ -103,6 +103,9 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "vendor.VSI_VX_ENABLE_STREAM_PR
static const char* ENV_FORCE_RGB888_OUT_NHWC = "vendor.VSI_NN_FORCE_RGB888_OUT_NHWC";
static const char* ENV_ENABLE_SLICE_OPTIMIZE = "vendor.VSI_NN_ENABLE_SLICE_OPTIMIZE";
static const char* ENV_ENABLE_BATCH_OPT = "vendor.VSI_VX_ENABLE_BATCH_OPT";
static const char* ENV_SAVE_FILE_TYPE = "vendor.VSI_SAVE_FILE_TYPE";
static const char* VSI_USE_IMAGE_PROCESS = "vendor.VSI_USE_IMAGE_PROCESS";
static const char* VSI_USE_FROM_HANDLE = "vendor.VSI_USE_FROM_HANDLE";
#else
static const char* ENV_ENABLE_SHADER = "VIV_VX_ENABLE_SHADER";
static const char* ENV_ENABLE_OPCHECK = "VSI_NN_ENABLE_OPCHECK";
@ -113,8 +116,11 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "VSI_VX_ENABLE_STREAM_PROCESSOR
static const char* ENV_FORCE_RGB888_OUT_NHWC = "VSI_NN_FORCE_RGB888_OUT_NHWC";
static const char* ENV_ENABLE_SLICE_OPTIMIZE = "VSI_NN_ENABLE_SLICE_OPTIMIZE";
static const char* ENV_ENABLE_BATCH_OPT = "VSI_VX_ENABLE_BATCH_OPT";
static const char* ENV_SAVE_FILE_TYPE = "VSI_SAVE_FILE_TYPE";
static const char* VSI_USE_IMAGE_PROCESS = "VSI_USE_IMAGE_PROCESS";
static const char* VSI_USE_FROM_HANDLE = "VSI_USE_FROM_HANDLE";
#endif
static vsi_status vsi_nn_initOptions
vsi_status vsi_nn_initOptions
(
vsi_nn_runtime_option_t *options
)
@ -129,7 +135,7 @@ static vsi_status vsi_nn_initOptions
default_value = 1;
#endif
options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value);
options->enable_asymi8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1);
options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1);
options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0);
@ -140,6 +146,9 @@ static vsi_status vsi_nn_initOptions
#endif
options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value);
options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0);
options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0);
options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1);
options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1);
return VSI_SUCCESS;
}
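
A hedged sketch (not part of the commit) of how the new options could be toggled from application code: vsi_nn_initOptions() reads the environment when the graph is created, so any override must be set beforehand. The umbrella header name and the exact semantics of each option are assumptions here.

    #include <stdlib.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header for the public API */

    /* Hypothetical setup: set the environment, then create the graph so the
     * options are picked up by vsi_nn_initOptions(). */
    static vsi_nn_graph_t* create_graph_with_options(vsi_nn_context_t ctx)
    {
        setenv("VSI_SAVE_FILE_TYPE", "1", 1);     /* enable_save_file_type     */
        setenv("VSI_USE_IMAGE_PROCESS", "1", 1);  /* enable_use_image_process  */
        setenv("VSI_USE_FROM_HANDLE", "0", 1);    /* enable_use_from_handle    */
        return vsi_nn_CreateGraph(ctx, 0, 0);
    }
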

View File

@ -1354,20 +1354,26 @@ vsi_nn_graph_t * vsi_nn_CreateGraph
graph->node_num = 0;
graph->ctx = ctx;
graph->rnn_wksp = NULL;
((vsi_nn_graph_prv_t*) graph)->options =
(vsi_nn_runtime_option_t *)malloc( sizeof( vsi_nn_runtime_option_t ));
CHECK_PTR_FAIL_GOTO(((vsi_nn_graph_prv_t*) graph)->options, "Create graph options fail.", error);
graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) );
graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) );
graph->isAllowFastMode = TRUE;
vsi_nn_MapInit( graph->node_table );
vsi_nn_MapInit( graph->tensor_table );
vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options );
}
else
{
VSILOGE( "Create vx graph fail." );
free( graph );
free(graph);
graph = NULL;
}
}
return graph;
error:
return graph;
} /* vsi_nn_CreateGraph() */
@ -1429,6 +1435,10 @@ void vsi_nn_ReleaseGraph
free( tmp );
}
}
if (NULL != ((vsi_nn_graph_prv_t*)ptr)->options)
{
free(((vsi_nn_graph_prv_t*)ptr)->options);
}
free( ptr );
*graph = NULL;
}
@ -1500,7 +1510,7 @@ vsi_status vsi_nn_SetupGraph
}
#if VX_GRAPH_BATCH_OPT_SUPPORT
if (graph->ctx->options.enable_batch_opt)
if (((vsi_nn_graph_prv_t*)graph)->options->enable_batch_opt)
{
/*processing batch splitting*/
status = batchInference_graph(graph, nodes_list);
@ -2064,7 +2074,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
const char * kernel_name
)
{
vsi_nn_node_t * node;
vsi_nn_node_prv_t* node;
vsi_nn_node_id_t id;
vsi_nn_op_proc_t * node_proc;
@ -2076,16 +2086,17 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
{
return NULL;
}
node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) );
node = (vsi_nn_node_prv_t*)malloc(sizeof(vsi_nn_node_prv_t));
if( NULL != node )
{
memset( node, 0, sizeof( vsi_nn_node_t ) );
node->graph = graph;
node->op = op;
node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR;
memset(node, 0, sizeof(vsi_nn_node_prv_t));
node->pon.graph = graph;
node->pon.op = op;
node->pon.vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
node->pon.vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
node->pon.vx_param.down_scale_size_rounding =
VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR;
/* init op */
if(node_proc->init != NULL){
@ -2093,31 +2104,31 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
}
/* init output struct */
node->output.num = node_proc->output_num;
node->output.tensors = (vsi_nn_tensor_id_t *) malloc(
node->pon.output.num = node_proc->output_num;
node->pon.output.tensors = (vsi_nn_tensor_id_t*)malloc(
node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) );
if ( NULL == node->output.tensors )
if (NULL == node->pon.output.tensors)
{
VSILOGE("Create output tensor id %s. fail", vsi_nn_OpGetName(op));
vsi_nn_safe_free(node);
return NULL;
}
vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num );
vsi_nn_InitTensorsId(node->pon.output.tensors, node_proc->output_num);
/* init input struct */
node->input.num = node_proc->input_num;
node->input.tensors = (vsi_nn_tensor_id_t *) malloc(
node->pon.input.num = node_proc->input_num;
node->pon.input.tensors = (vsi_nn_tensor_id_t*)malloc(
node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) );
if ( NULL == node->input.tensors )
if (NULL == node->pon.input.tensors)
{
VSILOGE("Create input tensor id %s. fail", vsi_nn_OpGetName(op));
vsi_nn_safe_free(node->output.tensors);
vsi_nn_safe_free(node->pon.output.tensors);
vsi_nn_safe_free(node);
return NULL;
}
vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num );
node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE;
node->attr.enable_op_constraint_check = TRUE;
vsi_nn_InitTensorsId(node->pon.input.tensors, node_proc->input_num);
node->pon.attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE;
node->pon.attr.enable_op_constraint_check = TRUE;
}
id = graph->cur_nid;
if(NULL != node){
@ -2126,7 +2137,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
graph->cur_nid ++;
}
vsi_nn_OpRegisterExternalOvxInit(op, kernel_name, node_proc);
return node;
return (vsi_nn_node_t*)node;
} /* vsi_nn_AddExternalNode() */
void vsi_nn_RemoveNode
@ -3354,24 +3365,245 @@ final:
return status;
} /* vsi_nn_ExecuteGraphLoop() */
typedef enum {
VSI_NN_ENABLE_I8TOU8 = 0,
VSI_NN_ENABLE_OPCHECK,
VSI_SAVE_FILE_TYPE,
VSI_USE_IMAGE_PROCESS,
VSI_NN_LOG_LEVEL,
VSI_NN_ENABLE_CONCAT_OPTIMIZE,
VSI_NN_ENABLE_DATACONVERT_OPTIMIZE,
VSI_VX_ENABLE_STREAM_PROCESSOR,
VSI_NN_FORCE_RGB888_OUT_NHWC,
VSI_NN_ENABLE_SLICE_OPTIMIZE,
VSI_VX_ENABLE_BATCH_OPT,
VIV_VX_ENABLE_SHADER,
VSI_USE_FROM_HANDLE,
VIV_VX_ENABLE_GRAPH_TRANSFORM
} VSI_PUBLIC_TYPE vsi_nn_runtime_variable;
vsi_status vsi_nn_SetGraphTransformOption
typedef struct {
const char* key;
int32_t value;
} VSI_PUBLIC_TYPE keyValuePair;
char* vsi_nn_GetRunTimeVariable
(
const vsi_nn_graph_t* graph,
const char* key
)
{
int32_t isValid = 1;
int32_t value = -1;
#define varSize 256
char* value_str = (char*)malloc(sizeof(char) * varSize);
CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final);
memset(value_str, 0, varSize);
char tmp_value[varSize] = {0};
VSI_UNREFERENCED(tmp_value);
vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options;
switch (vsi_nn_GetVariable(key))
{
case VIV_VX_ENABLE_SHADER:
value = options->enable_shader;
break;
case VSI_NN_ENABLE_OPCHECK:
value = options->enable_opcheck;
break;
case VSI_NN_ENABLE_I8TOU8:
value = options->enable_i8_to_u8;
break;
case VSI_VX_ENABLE_STREAM_PROCESSOR:
value = options->enable_stream_processor;
break;
case VSI_VX_ENABLE_BATCH_OPT:
value = options->enable_batch_opt;
break;
case VSI_NN_FORCE_RGB888_OUT_NHWC:
value = options->enable_rgb88_planar_nhwc;
break;
case VSI_SAVE_FILE_TYPE:
value = options->enable_save_file_type;
break;
case VSI_NN_ENABLE_CONCAT_OPTIMIZE:
value = options->enable_concat_optimize;
break;
case VSI_NN_ENABLE_SLICE_OPTIMIZE:
value = options->enable_slice_optimize;
break;
case VSI_USE_IMAGE_PROCESS:
if (options->enable_use_image_process != -1)
{
value = options->enable_use_image_process;
}
else
{
isValid = 0;
}
break;
case VSI_USE_FROM_HANDLE:
if (options->enable_use_from_handle != -1)
{
value = options->enable_use_from_handle;
}
else
{
isValid = 0;
}
break;
default:
isValid = 0;
VSILOGE("Unsupported key: %s.", key);
}
if (isValid == 1)
{
snprintf(tmp_value, varSize, "%d", value);
memcpy(value_str, tmp_value, varSize);
} else
{
goto final;
}
#undef varSize
return value_str;
final:
#undef varSize
vsi_nn_safe_free(value_str);
return value_str;
}
vsi_status vsi_nn_SetRunTimeVariable
(
vsi_nn_graph_t* graph,
const char* ctrl_str,
size_t size
const char* key,
const char* value
)
{
vsi_status status = VSI_SUCCESS;
size_t size = 1; // placeholder, not used in vxSetGraphAttribute.
if (graph == NULL)
{
status = VSI_FAILURE;
return status;
}
vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options;
VSI_UNREFERENCED(size);
if (vsi_nn_getenv(key) == NULL)
{
switch (vsi_nn_GetVariable(key) )
{
case VIV_VX_ENABLE_SHADER:
options->enable_shader = atoi(value);
break;
case VSI_NN_ENABLE_OPCHECK:
options->enable_opcheck = atoi(value);
break;
case VSI_NN_ENABLE_I8TOU8:
options->enable_i8_to_u8 = atoi(value);
break;
case VSI_VX_ENABLE_STREAM_PROCESSOR:
options->enable_stream_processor = atoi(value);
break;
case VSI_VX_ENABLE_BATCH_OPT:
options->enable_batch_opt = atoi(value);
break;
case VSI_NN_FORCE_RGB888_OUT_NHWC:
options->enable_rgb88_planar_nhwc = atoi(value);
break;
case VSI_NN_ENABLE_CONCAT_OPTIMIZE:
options->enable_concat_optimize = atoi(value);
break;
case VSI_NN_ENABLE_DATACONVERT_OPTIMIZE:
options->enable_dataconvert_optimize = atoi(value);
break;
case VSI_NN_ENABLE_SLICE_OPTIMIZE:
options->enable_slice_optimize = atoi(value);
break;
case VSI_SAVE_FILE_TYPE:
options->enable_save_file_type = atoi(value);
break;
case VSI_USE_IMAGE_PROCESS:
options->enable_use_image_process = atoi(value);
break;
case VSI_USE_FROM_HANDLE:
options->enable_use_from_handle = atoi(value);
break;
case VIV_VX_ENABLE_GRAPH_TRANSFORM:
#ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT
if (graph && graph->g) {
status = vxSetGraphAttribute(
graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, value, size);
}
#else
status = VSI_FAILURE;
VSILOGE("VX_GRAPH_TRANSFORM_OPTION_SUPPORT is not defined, please check driver version.");
#endif
break;
default:
#ifdef VX_GRAPH_ENV_SUPPORT
status = vxSetGraphEnv(graph->g, key, value);
#else
status = VSI_FAILURE;
VSILOGE("VX_GRAPH_ENV_SUPPORT is not defined, please check driver version.");
#endif
break;
}
}
return status;
}
int32_t vsi_nn_GetVariable(const char* variableKey) {
keyValuePair dict[] = {
{"VSI_NN_ENABLE_I8TOU8", VSI_NN_ENABLE_I8TOU8},
{"VSI_NN_ENABLE_OPCHECK", VSI_NN_ENABLE_OPCHECK},
{"VSI_SAVE_FILE_TYPE", VSI_SAVE_FILE_TYPE},
{"VSI_USE_IMAGE_PROCESS", VSI_USE_IMAGE_PROCESS},
{"VSI_NN_ENABLE_CONCAT_OPTIMIZE", VSI_NN_ENABLE_CONCAT_OPTIMIZE},
{"VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", VSI_NN_ENABLE_DATACONVERT_OPTIMIZE},
{"VSI_VX_ENABLE_STREAM_PROCESSOR", VSI_VX_ENABLE_STREAM_PROCESSOR},
{"VSI_NN_FORCE_RGB888_OUT_NHWC", VSI_NN_FORCE_RGB888_OUT_NHWC},
{"VSI_NN_ENABLE_SLICE_OPTIMIZE", VSI_NN_ENABLE_SLICE_OPTIMIZE},
{"VSI_VX_ENABLE_BATCH_OPT", VSI_VX_ENABLE_BATCH_OPT},
{"VIV_VX_ENABLE_SHADER", VIV_VX_ENABLE_SHADER},
{"VSI_USE_FROM_HANDLE", VSI_USE_FROM_HANDLE},
{"VIV_VX_ENABLE_GRAPH_TRANSFORM", VIV_VX_ENABLE_GRAPH_TRANSFORM},
{NULL, -1}
};
for (int32_t i = 0; dict[i].key != NULL; i++) {
if (strcmp(dict[i].key, variableKey) == 0) {
return dict[i].value;
}
}
return -1;
}
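
A hedged usage sketch for the per-graph runtime-variable API shown above. The keys come from the dictionary in vsi_nn_GetVariable, environment variables (when set) take priority over vsi_nn_SetRunTimeVariable, and the getter returns a heap-allocated string the caller should free. The umbrella header name is an assumption.

    #include <stdio.h>
    #include <stdlib.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header */

    /* Hypothetical helper: override two options on an already-created graph,
     * then read one of them back. */
    static void tune_graph(vsi_nn_graph_t* graph)
    {
        char* value = NULL;
        vsi_nn_SetRunTimeVariable(graph, "VIV_VX_ENABLE_SHADER", "1");
        vsi_nn_SetRunTimeVariable(graph, "VSI_NN_ENABLE_OPCHECK", "0");

        value = vsi_nn_GetRunTimeVariable(graph, "VSI_NN_ENABLE_OPCHECK");
        if (value != NULL)
        {
            printf("opcheck = %s\n", value);
            free(value);   /* the getter malloc's the returned string */
        }
    }
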
OVXLIB_API char* vsi_nn_GenerateGraphJson
(
vsi_nn_graph_t* graph
)
{
char* json = NULL;
VSI_UNREFERENCED(graph);
#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT
if (graph && graph->g)
{
json = vxGenerateGraphJson(graph->g);
}
#endif
return json;
}
OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson
(
char* json
)
{
vsi_status status = VSI_FAILURE;
VSI_UNREFERENCED(graph);
VSI_UNREFERENCED(ctrl_str);
VSI_UNREFERENCED(size);
#ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT
if(graph && graph->g)
{
status = vxSetGraphAttribute(graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, ctrl_str, size);
VSI_UNREFERENCED(json);
#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT
if (json) {
status = vxReleaseGraphJson(json);
}
#endif
return status;
}
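
A hedged sketch of how the JSON pair above might be used. vxGenerateGraphJson is only reached when the driver headers define VX_GENERATE_GRAPH_JSON_API_SUPPORT, so the result can legitimately be NULL; the umbrella header name is an assumption.

    #include <stdio.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header */

    /* Hypothetical dump helper: generate the JSON view of a set-up graph,
     * print it, and hand it back through the matching release call. */
    static void dump_graph_json(vsi_nn_graph_t* graph)
    {
        char* json = vsi_nn_GenerateGraphJson(graph);
        if (json != NULL)
        {
            printf("%s\n", json);
            vsi_nn_ReleaseGraphJson(json);   /* releases via vxReleaseGraphJson when supported */
        }
    }
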

View File

@ -26,6 +26,7 @@
#include "vsi_nn_graph_optimization.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
@ -37,14 +38,50 @@ static vsi_bool _is_asymm_int8_norm_tensor
{
vsi_bool ret = FALSE;
ret = ( tensor != NULL
&& tensor->attr.vtl == FALSE && tensor->attr.is_const == FALSE
&& tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
&& tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
ret = ( tensor != NULL &&
tensor->attr.vtl == FALSE &&
tensor->attr.is_const == FALSE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
);
return ret;
}/* _is_asymm_int8_norm_tensor() */
static vsi_bool _is_symm_int8_norm_tensor
(
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
ret = (tensor != NULL &&
tensor->attr.vtl == FALSE &&
tensor->attr.is_const == FALSE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC
);
return ret;
}/* _is_symm_int8_norm_tensor() */
static vsi_bool _is_int8_norm_tensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
vsi_bool support_symi8 =
((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2;
ret = _is_asymm_int8_norm_tensor(tensor);
ret = ret || (support_symi8 && _is_symm_int8_norm_tensor(tensor));
return ret;
}/* _is_int8_norm_tensor() */
static vsi_bool _is_asymm_int8_const_tensor
(
vsi_nn_tensor_t * tensor
@ -52,14 +89,47 @@ static vsi_bool _is_asymm_int8_const_tensor
{
vsi_bool ret = FALSE;
ret = ( tensor != NULL
&& tensor->attr.is_const == TRUE
&& tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
&& tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
ret = ( tensor != NULL &&
tensor->attr.is_const == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
);
return ret;
}/* _is_asymm_int8_const_tensor() */
static vsi_bool _is_symm_int8_const_tensor
(
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
ret = (tensor != NULL &&
tensor->attr.is_const == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC
);
return ret;
}/* _is_symm_int8_const_tensor() */
static vsi_bool _is_int8_const_tensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
vsi_bool support_symi8 =
((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2;
ret = _is_asymm_int8_const_tensor(tensor);
ret = ret || (support_symi8 && _is_symm_int8_const_tensor(tensor));
return ret;
}/* _is_int8_const_tensor() */
static vsi_bool _is_asymm_int8_virtual_tensor
(
vsi_nn_tensor_t * tensor
@ -67,14 +137,47 @@ static vsi_bool _is_asymm_int8_virtual_tensor
{
vsi_bool ret = FALSE;
ret = ( tensor != NULL
&& tensor->attr.vtl == TRUE
&& tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
&& tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
ret = ( tensor != NULL &&
tensor->attr.vtl == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
);
return ret;
}/* _is_asymm_int8_virtual_tensor() */
static vsi_bool _is_symm_int8_virtual_tensor
(
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
ret = (tensor != NULL &&
tensor->attr.vtl == TRUE &&
tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC
);
return ret;
}/* _is_symm_int8_virtual_tensor() */
static vsi_bool _is_int8_virtual_tensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = FALSE;
vsi_bool support_symi8 =
((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2;
ret = _is_asymm_int8_virtual_tensor(tensor);
ret = ret || (support_symi8 && _is_symm_int8_virtual_tensor(tensor));
return ret;
}/* _is_int8_virtual_tensor() */
static vsi_status _add_forward_node
(
vsi_nn_graph_t* graph,
@ -199,7 +302,7 @@ static void _get_graph_input_asymm_int8_norm_tensor
vsi_nn_tensor_id_t id = node->input.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
if (_is_asymm_int8_norm_tensor(tensor))
if (_is_int8_norm_tensor(graph, tensor))
{
if(tensor_ids != NULL)
{
@ -251,7 +354,7 @@ static void _get_graph_output_asymm_int8_norm_tensor
vsi_nn_tensor_id_t id = node->output.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
if (_is_asymm_int8_norm_tensor(tensor))
if (_is_int8_norm_tensor(graph, tensor))
{
if(tensor_ids != NULL)
{
@ -360,6 +463,7 @@ static vsi_status _add_graph_dataconvert_for_int8
{
memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t));
attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
attr.dtype.zero_point += 128;
attr.vtl = TRUE;
output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL );
@ -383,6 +487,7 @@ static vsi_status _add_graph_dataconvert_for_int8
{
memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t));
attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
attr.dtype.zero_point += 128;
attr.vtl = TRUE;
input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL );
@ -788,6 +893,7 @@ static void _convert_const_I8toU8
}
attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
attr->dtype.zero_point += 128;
if ( tensor->t ) vxReleaseTensor(&tensor->t);
@ -818,7 +924,7 @@ static vsi_status _convert_graph_const_tensor
vsi_nn_tensor_id_t id = node->input.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
if (_is_asymm_int8_const_tensor(tensor))
if (_is_int8_const_tensor(graph, tensor))
{
_convert_const_I8toU8(graph, id);
}
@ -835,11 +941,9 @@ static vsi_status _convert_virtual_tensor_attr
vsi_nn_tensor_t * tensor
)
{
if (_is_asymm_int8_virtual_tensor(tensor))
{
tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
tensor->attr.dtype.zero_point += 128;
}
tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
tensor->attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
tensor->attr.dtype.zero_point += 128;
return VSI_SUCCESS;
}/* _convert_virtual_tensor_attr() */
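
For reference, the +128 shift performed here keeps dequantized values unchanged: with an unchanged scale s, an int8 value q with zero point z satisfies s*(q - z) = s*((q + 128) - (z + 128)), so storing q + 128 as uint8 with zero point z + 128 is lossless. For symmetric int8 (z = 0, now covered when enable_i8_to_u8 == 2) the same shift simply produces an asymmetric uint8 tensor with zero point 128.
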
@ -849,7 +953,7 @@ static vsi_status _convert_graph_virtual_tensor
vsi_nn_graph_t* graph
)
{
vsi_status status = VSI_FAILURE;
vsi_status status = VSI_SUCCESS;
uint32_t node_num = graph->node_num;
vsi_nn_node_t* node = NULL;
uint32_t i = 0;
@ -865,7 +969,10 @@ static vsi_status _convert_graph_virtual_tensor
vsi_nn_tensor_id_t id = node->input.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
status = _convert_virtual_tensor_attr(tensor);
if (_is_int8_virtual_tensor(graph, tensor))
{
status = _convert_virtual_tensor_attr(tensor);
}
}
for(j = 0; j < node->output.num; j++)
@ -873,7 +980,10 @@ static vsi_status _convert_graph_virtual_tensor
vsi_nn_tensor_id_t id = node->output.tensors[j];
vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
status = _convert_virtual_tensor_attr(tensor);
if (_is_int8_virtual_tensor(graph, tensor))
{
status = _convert_virtual_tensor_attr(tensor);
}
}
}
@ -925,7 +1035,7 @@ vsi_status vsi_nn_OptimizeGraph
status = VSI_SUCCESS;
if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8)
if (!nbg_flag && ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8)
{
status = _graph_optimization_convert_int8_to_uint8(graph, dirty);
CHECK_STATUS_FAIL_GOTO(status, final);

View File

@ -452,7 +452,8 @@ void vsi_nn_internal_init_tensor_attr
if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE &&
( dtype->vx_type != VSI_NN_TYPE_FLOAT16 &&
dtype->vx_type != VSI_NN_TYPE_FLOAT32 &&
dtype->vx_type != VSI_NN_TYPE_BFLOAT16 ) )
dtype->vx_type != VSI_NN_TYPE_BFLOAT16 &&
dtype->vx_type != VSI_NN_TYPE_INT32) )
{
attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16;

View File

@ -208,6 +208,10 @@ static _node_template s_template[] =
/* RESIZE_3D */ NULL,
/* REDUCEL2 */ NULL,
/* CROP_AND_RESIZE */ NULL,
/* BITCAST */ NULL,
/* GROUPED_CONV3D */ NULL,
/* COL2IM */ NULL,
/* L1_LAYER_NORM */ NULL,
};
//_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c );

View File

@ -26,6 +26,7 @@
#include "vsi_nn_client_op.h"
#include "vsi_nn_node.h"
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -281,7 +282,7 @@ vsi_bool vsi_nn_OpCheck
if ( NULL != proc )
{
ret = TRUE;
if ( proc->check && node->graph->ctx->options.enable_opcheck)
if ( proc->check && ((vsi_nn_graph_prv_t*)(node->graph))->options->enable_opcheck)
{
ret = proc->check( node, inputs, outputs );
}

View File

@ -144,6 +144,17 @@ static void print_tensor
tensor->attr.dtype.scale_dim);
ext_attr[count] = 0;
break;
#endif
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
count = snprintf(&ext_attr[0],
_EXT_ATTR_BUF_SZ,
"SYM GPTQ axis=%d, count=%d, group_size=%d",
tensor->attr.dtype.group_channel_dim,
tensor->attr.dtype.group_count,
tensor->attr.dtype.group_size);
ext_attr[count] = 0;
break;
#endif
default:
vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ);
@ -430,6 +441,25 @@ static vsi_bool _init_tensor
VSILOGE(
"can't support qnt_type "
"VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC.");
#endif
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP;
// Workaround: the driver does not support const scales, so copy them into a writable buffer
scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
params.quant_data.affinePerGroup.scales = scales;
params.quant_data.affinePerGroup.zero_points = NULL;
params.quant_data.affinePerGroup.zero_point_group_count = 0;
break;
#else
VSILOGE(
"can't support qnt_type "
"VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC.");
#endif
default:
break;
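
A hedged sketch (field names taken from the code above; the concrete shape, group size, and group count are purely illustrative, and their exact relationship is defined by the driver) of how a per-group symmetric tensor might be described before it is added to a graph, when VSI_PER_GROUP_QUANTIZATION_SUPPORT is defined:

    #include <string.h>
    #include "vsi_nn_pub.h"   /* assumed umbrella header */

    /* Hypothetical GPTQ-style per-group symmetric weight: 512 elements split
     * into 16 groups of 32 along axis 0, one scale per group. */
    static vsi_nn_tensor_id_t add_group_quant_weight(vsi_nn_graph_t* graph,
                                                     float* group_scales /* 16 scales */)
    {
        vsi_nn_tensor_attr_t attr;
        memset(&attr, 0, sizeof(attr));
        attr.dim_num = 1;
        attr.size[0] = 512;
        attr.is_const = TRUE;
        attr.dtype.vx_type = VSI_NN_TYPE_INT8;
        attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC;
        attr.dtype.group_channel_dim = 0;   /* axis that is split into groups   */
        attr.dtype.group_size = 32;         /* elements per group on that axis  */
        attr.dtype.group_count = 16;        /* number of entries in group_scales */
        attr.dtype.group_scales = group_scales;
        return vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL /* data omitted */);
    }
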

View File

@ -58,6 +58,7 @@ typedef struct _vsi_nn_graph_prv
// Add graph internal attribute here...
vsi_nn_swap_handle_cache_t swap_handle_cache;
vsi_nn_runtime_option_t* options;
} vsi_nn_graph_prv_t;
/** Internal Node structure, internal use only. */