Update internal ovxlib to rel/1.2.14 (#699)

Type: New Feature Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
2024-07-08 09:29:24 +08:00 · 2024-07-08 09:29:24 +08:00 · c8b7c410bf
parent 8894360c74
commit c8b7c410bf
94 changed files with 14958 additions and 320 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-1.2.6
+1.2.14
--- a/src/tim/vx/internal/include/interface/ops.def
+++ b/src/tim/vx/internal/include/interface/ops.def
@ -199,3 +199,7 @@ DEF_OP(CROP_AND_RESIZE)
 DEF_OP(TAN)
 DEF_OP(RMSNORM)
 DEF_OP(SHAPE)
 DEF_OP(BITCAST)
 DEF_OP(GROUPED_CONV3D)
 DEF_OP(COL2IM)
 DEF_OP(L1_LAYER_NORM)
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h
@ -0,0 +1,44 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #ifndef _VSI_NN_OP_BITCAST_H
 #define _VSI_NN_OP_BITCAST_H
 #include "vsi_nn_types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Node parameters for the BITCAST operation. */
 typedef struct _vsi_nn_bitcast_param
 {
    /* Pointer to a struct type that is only forward-declared here;
     * presumably op-private data owned by the op implementation - confirm there. */
    struct _bitcast_local_data_t* local;
 } vsi_nn_bitcast_param;
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h
@ -0,0 +1,49 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #ifndef _VSI_NN_OP_COL2IM_H
 #define _VSI_NN_OP_COL2IM_H
 #include "vsi_nn_types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Node parameters for the COL2IM operation. */
 typedef struct _vsi_nn_col2im_param
 {
    /* Caller-provided, read-only array; presumably the spatial shape of the
     * output image - TODO confirm against the op implementation. */
    const int32_t* image_shape;
    /* Caller-provided, read-only array; presumably the per-axis block (kernel)
     * sizes - TODO confirm against the op implementation. */
    const int32_t* block_shape;
    /* Up to three per-axis strides. */
    int32_t      strides[3];
    /* Six padding values; presumably a front/end pair per axis - TODO confirm order. */
    int32_t      pads[6];
    /* Up to three per-axis dilations. */
    int32_t      dilations[3];
    /* NOTE(review): looks like the number of valid entries in the arrays above - confirm. */
    int32_t      dim_num;
 } vsi_nn_col2im_param;
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h
@ -0,0 +1,55 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #ifndef _VSI_NN_OP_GROUPED_CONV3D_H
 #define _VSI_NN_OP_GROUPED_CONV3D_H
 #include "vsi_nn_types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Node parameters for the GROUPED_CONV3D operation. */
 typedef struct _vsi_nn_grouped_conv3d_param
 {
    /* Untyped pointer; presumably op-private data - confirm in the op implementation. */
    void*          local;
    /* Three kernel-size values (axis order not stated here - TODO confirm). */
    uint32_t       ksize[3];
    /* Three stride values, one per kernel axis. */
    uint32_t       stride[3];
    /* Pad left, right, top, bottom, front, rear */
    uint32_t       pad[6];
    /* Pad type default value shall be AUTO */
    vsi_nn_pad_e   pad_type;
    /* NOTE(review): presumably the number of output channels - confirm. */
    uint32_t       weights;
    /* Number of convolution groups. */
    uint32_t       group;
    /* Three dilation values, one per kernel axis. */
    uint32_t       dilation[3];
    /* NOTE(review): presumably a depthwise channel multiplier - confirm. */
    int32_t        multiplier;
    /* Padding mode selector (see vsi_nn_pad_mode_e). */
    vsi_nn_pad_mode_e pad_mode;
 } vsi_nn_grouped_conv3d_param;
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h
@ -0,0 +1,47 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #ifndef _VSI_NN_OP_L1_LAYER_NORM_H
 #define _VSI_NN_OP_L1_LAYER_NORM_H
 #include "vsi_nn_types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Node parameters for the L1_LAYER_NORM operation. */
 typedef struct _vsi_nn_l1_layer_norm_param
 {
    /* Pointer to a struct type that is only forward-declared here;
     * presumably op-private data owned by the op implementation - confirm there. */
    struct _l1_layer_norm_local_data_t * local;
    /* NOTE(review): presumably the epsilon added for numerical stability - confirm. */
    float eps;
    /* NOTE(review): presumably the axis to normalize along - confirm. */
    int32_t axis;
 } vsi_nn_l1_layer_norm_param;
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/src/tim/vx/internal/include/utils/vsi_nn_util.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h
@ -349,7 +349,7 @@ vsi_bool vsi_nn_IsEVISFeatureAvaiable
    vsi_nn_context_t context
    );
-int32_t vsi_nn_compareVersion
+OVXLIB_API int32_t vsi_nn_compareVersion
    (
    vsi_nn_graph_t * graph,
    uint32_t version_major,
--- a/src/tim/vx/internal/include/vsi_nn/vsi_nn.h
+++ b/src/tim/vx/internal/include/vsi_nn/vsi_nn.h
--- a/src/tim/vx/internal/include/vsi_nn_context.h
+++ b/src/tim/vx/internal/include/vsi_nn_context.h
@ -26,6 +26,7 @@
 #define _VSI_NN_CONTEXT_H
 #include "vsi_nn_platform.h"
 #include "vsi_nn_types.h"
 #ifdef __cplusplus
 extern "C" {
@ -75,12 +76,19 @@ typedef struct _vsi_nn_runtime_option_t
    int32_t enable_shader;
    int32_t enable_opcheck;
    int32_t enable_concat_optimize;
-    int32_t enable_asymi8_to_u8;
+    /*  0: disable convert int8 to uint8
     *  1: enable convert asymm int8 to asymm uint8
     *  2: enable convert both asymm and sym int8 to asymm uint8
     */
    int32_t enable_i8_to_u8;
    int32_t enable_dataconvert_optimize;
    int32_t enable_stream_processor;
    int32_t enable_rgb88_planar_nhwc;
    int32_t enable_slice_optimize;
    int32_t enable_batch_opt;
    int32_t enable_save_file_type;
    int32_t enable_use_image_process;
    int32_t enable_use_from_handle;
 } vsi_nn_runtime_option_t;
 /**
@ -101,6 +109,10 @@ typedef struct _vsi_nn_context_t
 OVXLIB_API vsi_nn_context_t vsi_nn_CreateContext
    ( void );
 OVXLIB_API vsi_status vsi_nn_initOptions
    (
    vsi_nn_runtime_option_t *options
    );
 /**
 * Release context
 * Release ovxlib NN runtime resource and reset context handle to NULL.
--- a/src/tim/vx/internal/include/vsi_nn_feature_config.h
+++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h
@ -53,5 +53,9 @@
 #if defined(VX_13_NN_COMPATIBLITY)
 #define VSI_MAP_TENSOR_PATCH_SUPPORT
 #endif
 #if defined (VX_QUANT_PER_GROUP_SUPPORT)
 #define VSI_PER_GROUP_QUANTIZATION_SUPPORT
 #endif
 #define VSI_GRAPH_RUNTIME_ENV_SUPPORT
 #endif
--- a/src/tim/vx/internal/include/vsi_nn_graph.h
+++ b/src/tim/vx/internal/include/vsi_nn_graph.h
@ -814,11 +814,77 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop
    vsi_nn_tensor_t *max_iteration_tensor
    );
-OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
+/**
 * Set runtime variable
 * Set runtime variable for ovxlib and driver.
 *
 * @param[in] graph Graph handle
 * @param[in] key Ovxlib and driver environment variable name
 * Ovxlib supported keys:
 * VSI_NN_ENABLE_I8TOU8
 * VSI_NN_ENABLE_OPCHECK
 * VSI_SAVE_FILE_TYPE
 * VSI_USE_IMAGE_PROCESS
 * VSI_NN_ENABLE_CONCAT_OPTIMIZE
 * VSI_NN_ENABLE_DATACONVERT_OPTIMIZE
 * VSI_VX_ENABLE_STREAM_PROCESSOR
 * VSI_NN_FORCE_RGB888_OUT_NHWC
 * VSI_NN_ENABLE_SLICE_OPTIMIZE
 * VSI_VX_ENABLE_BATCH_OPT
 * VSI_USE_FROM_HANDLE
 * Driver keys:
 * VIV_VX_ENABLE_GRAPH_TRANSFORM
 * VIV_VX_ENABLE_SHADER
 * Any key other than the ovxlib keys listed above is treated as a driver environment variable.
 * @return VSI_SUCCESS on success, or appropriate error code otherwise
 */
 OVXLIB_API vsi_status vsi_nn_SetRunTimeVariable
    (
    vsi_nn_graph_t* graph,
-    const char* ctrl_str,
+    const char* key,
-    size_t size
+    const char* value
    );
 /**
 * Get runtime variable
 * Get runtime variable of ovxlib.
 *
 * @param[in] graph Graph handle
 * @param[in] key Environment variable name
 * Supported keys:
 * VSI_NN_ENABLE_I8TOU8
 * VSI_NN_ENABLE_OPCHECK
 * VSI_SAVE_FILE_TYPE
 * VSI_USE_IMAGE_PROCESS
 * VSI_NN_ENABLE_CONCAT_OPTIMIZE
 * VSI_NN_ENABLE_DATACONVERT_OPTIMIZE
 * VSI_VX_ENABLE_STREAM_PROCESSOR
 * VSI_NN_FORCE_RGB888_OUT_NHWC
 * VSI_NN_ENABLE_SLICE_OPTIMIZE
 * VSI_VX_ENABLE_BATCH_OPT
 * VSI_USE_FROM_HANDLE
 * VIV_VX_ENABLE_GRAPH_TRANSFORM
 * VIV_VX_ENABLE_SHADER
 * Only the keys listed above are supported.
 * @return Variable's value on success, or NULL otherwise. On success, the caller
 *                 must release the returned memory after use.
 */
 OVXLIB_API char* vsi_nn_GetRunTimeVariable
    (
    const vsi_nn_graph_t* graph,
    const char* key
    );
 int32_t vsi_nn_GetVariable(const char* variableKey);
 OVXLIB_API char* vsi_nn_GenerateGraphJson
    (
    vsi_nn_graph_t* graph
    );
 OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson
    (
    char* json
    );
 /**
--- a/src/tim/vx/internal/include/vsi_nn_node_type.h
+++ b/src/tim/vx/internal/include/vsi_nn_node_type.h
@ -212,6 +212,10 @@
 #include "ops/vsi_nn_op_crop_and_resize.h"
 #include "ops/vsi_nn_op_rmsnorm.h"
 #include "ops/vsi_nn_op_shape.h"
 #include "ops/vsi_nn_op_bitcast.h"
 #include "ops/vsi_nn_op_grouped_conv3d.h"
 #include "ops/vsi_nn_op_col2im.h"
 #include "ops/vsi_nn_op_l1_layer_norm.h"
 /* custom node header defines */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -412,6 +416,10 @@ typedef union _vsi_nn_nn_param
    vsi_nn_crop_and_resize_param    crop_and_resize;
    vsi_nn_rmsnorm_param            rmsnorm;
    vsi_nn_shape_param              shape;
    vsi_nn_bitcast_param            bitcast;
    vsi_nn_grouped_conv3d_param     grouped_conv3d;
    vsi_nn_col2im_param             col2im;
    vsi_nn_l1_layer_norm_param      l1_layer_norm;
    void*                         client_param;
    /* custom node data struct define */
--- a/src/tim/vx/internal/include/vsi_nn_tensor.h
+++ b/src/tim/vx/internal/include/vsi_nn_tensor.h
@ -86,6 +86,8 @@ typedef enum
    VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
    /** perchannel float8 */
    VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
    /** GPTQ */
    VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
    /** undefined type */
    VSI_NN_QNT_TYPE_NA = 0xff,
 } vsi_nn_qnt_type_e;
@ -126,6 +128,16 @@ typedef struct vsi_nn_dtype
                const int32_t * zero_points;
                int32_t         zero_points_dim;
            };
 #endif
 #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
            /** Meaningful in GPTQ_SYMMETRIC */
            struct {
                const float* group_scales;
                int32_t group_channel_dim;
                int32_t group_size;
                const int32_t* group_zero_points;
                int32_t group_count;
            };
 #endif
        };
    };
--- a/src/tim/vx/internal/include/vsi_nn_version.h
+++ b/src/tim/vx/internal/include/vsi_nn_version.h
@ -33,7 +33,7 @@ extern "C"{
 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 2
-#define VSI_NN_VERSION_PATCH 5
+#define VSI_NN_VERSION_PATCH 14
 #define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
--- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c
@ -35,6 +35,8 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (!VX_ARGMAX_VX_SUPPORT)
 __BEGIN_DECLS
@ -289,3 +291,5 @@ OnError:
 __END_DECLS
 REGISTER_BACKEND_CL( argmax, _setup )
 #endif
--- a/src/tim/vx/internal/src/kernel/cl/col2im_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/col2im_cl.c
@ -0,0 +1,432 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_error.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
 __BEGIN_DECLS
 /* Name of the CL source that holds every col2im kernel variant. */
 #define _COL2IM_KERNEL_SOURCE_NAME      "col2im"
 /*
 * Pack the input dtype, the output dtype and the 2D-image flag into one
 * lookup key: IN_DTYPE is shifted left by 16, OUT_DTYPE by 8 and the flag is
 * OR-ed into the low bits. Every argument is parenthesized so that an
 * expression argument (e.g. "a | b") is shifted as a whole.
 */
 #define COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d) \
        (( (IN_DTYPE) << 16 ) | ( (OUT_DTYPE) << 8 ) | (_image_2d))
 /* Table entry for the non-2D kernel "cl.col2im_<IN>to<OUT>". */
 #define COL2IM_KERNELS( IN_DTYPE, OUT_DTYPE ) \
        { COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 0), \
         CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE), \
         _COL2IM_KERNEL_SOURCE_NAME }
 /* Table entry for the 2D kernel "cl.col2im_<IN>to<OUT>_2D". */
 #define COL2IM_KERNELS_2D( IN_DTYPE, OUT_DTYPE ) \
        { COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 1), \
         CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
         _COL2IM_KERNEL_SOURCE_NAME }
 /*
 * One row of the kernel lookup table: a hash key and the names used to
 * register the matching kernel. Both strings are only ever read, so they are
 * declared const (the table rows are initialised from string literals).
 */
 typedef struct
 {
    uint32_t key;               /* value built with COL2IM_HASH_KEY */
    const char * function_name; /* kernel function name copied into kernel->info.name */
    const char * source_name;   /* kernel source name passed to vsi_nn_kernel_add_source */
 } _kernel_map_type;
 /*
 * Lookup table of the available col2im CL kernels. It holds every
 * combination of F32/U32/I32 input and output dtypes, first for the non-2D
 * kernels and then for the "_2D" kernels. _query_kernel() scans it for the
 * row whose key matches the requested dtypes and 2D flag.
 */
 static const _kernel_map_type _col2im_kernel_map[] =
 {
    // Register kernel here
    COL2IM_KERNELS( F32, F32 ),
    COL2IM_KERNELS( F32, U32 ),
    COL2IM_KERNELS( F32, I32 ),
    COL2IM_KERNELS( U32, U32 ),
    COL2IM_KERNELS( U32, F32 ),
    COL2IM_KERNELS( U32, I32 ),
    COL2IM_KERNELS( I32, I32 ),
    COL2IM_KERNELS( I32, U32 ),
    COL2IM_KERNELS( I32, F32 ),
    /* 2D variants of the same dtype combinations */
    COL2IM_KERNELS_2D( F32, F32 ),
    COL2IM_KERNELS_2D( F32, U32 ),
    COL2IM_KERNELS_2D( F32, I32 ),
    COL2IM_KERNELS_2D( U32, U32 ),
    COL2IM_KERNELS_2D( U32, F32 ),
    COL2IM_KERNELS_2D( U32, I32 ),
    COL2IM_KERNELS_2D( I32, I32 ),
    COL2IM_KERNELS_2D( I32, U32 ),
    COL2IM_KERNELS_2D( I32, F32 ),
 };
 /*
 * Kernel params
 *
 * One descriptor per kernel argument. The per-slot notes below name the value
 * that _setup() stores at the same index of node_params.
 */
 static vx_param_description_t _col2im_kernel_param_def[] =
 {
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},  /* [0]  reshaped input tensor */
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, /* [1]  reshaped output tensor */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [2]  stride_w */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [3]  stride_h */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [4]  stride_d */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [5]  dilation_w */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [6]  dilation_h */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [7]  dilation_d */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [8]  pad_w_front */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [9]  pad_w_end */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [10] pad_h_front */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [11] pad_h_end */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [12] pad_d_front */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [13] pad_d_end */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [14] kernel_w */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [15] kernel_h */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [16] kernel_d */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [17] inOutScale (F32) */
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},  /* [18] inOutTile (F32) */
 };
 /* Number of kernel arguments described above. */
 #define _COL2IM_PARAM_NUM  _cnt_of_array( _col2im_kernel_param_def )
 /*
 * Kernel initializer
 *
 * Reads the scalar arguments of the node, derives the padded extents, the
 * number of window positions along x/y and the dilated kernel extents,
 * uploads them as GPU parameters and configures one work-item per element of
 * the (reshaped, 3-D) output tensor.
 *
 * param[0]/param[1] are the input/output tensors; param[2..16] are the int32
 * scalars in the order listed in _col2im_kernel_param_def. param[4]
 * (stride_d) is not needed here and is therefore not read.
 *
 * Returns VSI_SUCCESS on success. Any failed step - including a zero or
 * negative batch/stride, which would otherwise lead to a division by zero -
 * yields a failure status.
 */
 DEF_KERNEL_INITIALIZER(_col2im_initializer)
    (
    vsi_nn_kernel_node_t                node,
    const vsi_nn_kernel_node_param_t  * param,
    size_t                              param_size
    )
 {
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,         // workdim
        {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
        {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
        {0, 0, 0}, // localWorkSize: local group size in thread
        {0, 0, 0}  // globalWorkSize: image size in thread
        };
    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
    vsi_size_array_t * out_shape = NULL;
    int32_t stride_w = 1, stride_h = 1;
    int32_t dilation_w = 1, dilation_h = 1, dilation_d = 1;
    int32_t pad_w_front = 0, pad_w_end = 0, pad_h_front = 0, pad_h_end = 0, pad_d_front = 0, pad_d_end = 0;
    int32_t kernel_w = 1, kernel_h = 1, kernel_d = 1;
    int32_t move_time_x = 0;
    int32_t move_time_y = 0;
    int32_t width_pad = 0;
    int32_t height_pad = 0;
    int32_t depth_pad = 0;
    int32_t kernel_x_new = 1;
    int32_t kernel_y_new = 1;
    int32_t kernel_z_new = 1;
    int32_t batch = 1;
    int32_t width = 1;
    int32_t height = 1;
    int32_t depth = 1;
    VSI_UNREFERENCED(param_size);
    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_w);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_h);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &dilation_w);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &dilation_h);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation_d);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_w_front);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_w_end);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_h_front);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &pad_h_end);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_d_front);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_d_end);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &kernel_w);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &kernel_h);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[16], &kernel_d);
    CHECK_STATUS_FAIL_GOTO(status, final );
    batch = (int32_t)(attr[0]->shape->data[2]);
    /* batch and both strides are used as divisors below; integer division by
     * zero is undefined behaviour, so reject such values through the regular
     * status check instead of computing with them. */
    if ( batch <= 0 || stride_w <= 0 || stride_h <= 0 )
    {
        status = VSI_FAILURE;
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    width = (int32_t)(attr[1]->shape->data[0]);
    height = (int32_t)(attr[1]->shape->data[1]);
    /* The third output dimension is divided by batch to recover the depth. */
    depth  = (int32_t)(attr[1]->shape->data[2]) / batch;
    width_pad = width + pad_w_front + pad_w_end;
    height_pad = height + pad_h_front + pad_h_end;
    depth_pad = depth + pad_d_front + pad_d_end;
    /* Extent covered by a kernel once dilation is applied. */
    kernel_x_new = (kernel_w - 1) * dilation_w + 1;
    kernel_y_new = (kernel_h - 1) * dilation_h + 1;
    kernel_z_new = (kernel_d - 1) * dilation_d + 1;
    /* Number of window positions along x and y of the padded image. */
    move_time_x = (width_pad - kernel_x_new + stride_w) / stride_w;
    move_time_y = (height_pad - kernel_y_new + stride_h) / stride_h;
    status = vsi_nn_kernel_gpu_add_param( node, "width_pad", &width_pad );
    status |= vsi_nn_kernel_gpu_add_param( node, "height_pad", &height_pad );
    status |= vsi_nn_kernel_gpu_add_param( node, "depth_pad", &depth_pad );
    status |= vsi_nn_kernel_gpu_add_param( node, "move_time_x", &move_time_x );
    status |= vsi_nn_kernel_gpu_add_param( node, "move_time_y", &move_time_y );
    status |= vsi_nn_kernel_gpu_add_param( node, "kernel_x_new", &kernel_x_new );
    status |= vsi_nn_kernel_gpu_add_param( node, "kernel_y_new", &kernel_y_new );
    status |= vsi_nn_kernel_gpu_add_param( node, "kernel_z_new", &kernel_z_new );
    status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth );
    CHECK_STATUS_FAIL_GOTO(status, final );
    /* The global work size follows the shape of the output tensor (attr[1]). */
    out_shape  = attr[1]->shape;
    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = out_shape->data[0];
    gpu_param.global_size[1] = out_shape->data[1];
    gpu_param.global_size[2] = out_shape->data[2];
    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
 final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
    }
    return status;
 } /* _col2im_initializer() */
 /*
 * Query kernel
 */
 static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    vsi_bool image_2d
    )
 {
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _col2im_kernel_map;
    size_t kernel_map_size              = _cnt_of_array( _col2im_kernel_map );
    vx_param_description_t * param_def  = _col2im_kernel_param_def;
    vx_kernel_initialize_f  initializer = _col2im_initializer;
    uint32_t key;
    uint32_t i;
    in_dtype  = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    if (F16 == in_dtype)
    {
        in_dtype = F32;
    }
    else if (U8 == in_dtype)
    {
        in_dtype = U32;
    }
    else if (I8 == in_dtype || I16 == in_dtype)
    {
        in_dtype = I32;
    }
    if (F16 == out_dtype)
    {
        out_dtype = F32;
    }
    else if (U8 == out_dtype)
    {
        out_dtype = U32;
    }
    else if (I8 == out_dtype || I16 == out_dtype)
    {
        out_dtype = I32;
    }
    key = COL2IM_HASH_KEY( in_dtype, out_dtype ,image_2d);
    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s",  kernel_map[i].function_name );
        kernel->info.parameters  = param_def;
        kernel->info.numParams   = _cnt_of_array( _col2im_kernel_param_def );
        kernel->info.initialize  = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
 } /* _query_kernel() */
 /*
 * Create the col2im CL node.
 *
 * The input and output tensors are reshaped to 3-D views, the matching kernel
 * is selected with _query_kernel() and the node arguments are passed in the
 * order described by _col2im_kernel_param_def.
 *
 * params must provide the int32 values "stride_{w,h,d}", "dilation_{w,h,d}",
 * "pad_{w,h,d}_{front,end}" and an int32 buffer "block_shape" holding the
 * kernel size per axis (missing h/d entries default to 1).
 *
 * Returns the created node, or NULL when "block_shape" is missing or empty,
 * a reshape fails, no kernel matches, or node creation fails. The temporary
 * reshaped tensors and every scalar created here are released before return.
 */
 static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
 {
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_COL2IM_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    vsi_bool image_2d = FALSE;
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
    float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
    float inputZp  = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float outputZp  = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    /* Requantization: out = in * inOutScale + inOutTile. */
    float inOutScale = inputScale / outputScale;
    float inOutTile = outputZp - inOutScale * inputZp;
    int32_t stride_w = vsi_nn_kernel_param_get_int32( params, "stride_w" );
    int32_t stride_h = vsi_nn_kernel_param_get_int32( params, "stride_h" );
    int32_t stride_d = vsi_nn_kernel_param_get_int32( params, "stride_d" );
    int32_t dilation_w = vsi_nn_kernel_param_get_int32( params, "dilation_w" );
    int32_t dilation_h = vsi_nn_kernel_param_get_int32( params, "dilation_h" );
    int32_t dilation_d = vsi_nn_kernel_param_get_int32( params, "dilation_d" );
    int32_t pad_w_front = vsi_nn_kernel_param_get_int32( params, "pad_w_front" );
    int32_t pad_w_end = vsi_nn_kernel_param_get_int32( params, "pad_w_end" );
    int32_t pad_h_front = vsi_nn_kernel_param_get_int32( params, "pad_h_front" );
    int32_t pad_h_end = vsi_nn_kernel_param_get_int32( params, "pad_h_end" );
    int32_t pad_d_front = vsi_nn_kernel_param_get_int32( params, "pad_d_front" );
    int32_t pad_d_end = vsi_nn_kernel_param_get_int32( params, "pad_d_end" );
    size_t dim_num = 0;
    size_t i = 0;
    int32_t* block_shape = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "block_shape", &dim_num);
    int32_t kernel_w = 1;
    int32_t kernel_h = 1;
    int32_t kernel_d = 1;
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    /* Without a block shape there is no kernel size to work with; do not
     * dereference a missing or empty buffer. */
    if ( block_shape == NULL || dim_num == 0 )
    {
        return NULL;
    }
    kernel_w = block_shape[0];
    kernel_h = dim_num > 1 ? block_shape[1] : 1;
    kernel_d = dim_num > 2 ? block_shape[2] : 1;
    image_2d = dim_num > 2 ? FALSE : TRUE;
    /* Collapse both tensors to 3-D views for the kernel. */
    shapes[0][0] = inputs[0]->attr.size[0];
    shapes[0][1] = inputs[0]->attr.size[1] / outputs[0]->attr.size[dim_num];
    shapes[0][2] = inputs[0]->attr.size[2] * outputs[0]->attr.size[dim_num];
    shapes[1][0] = outputs[0]->attr.size[0];
    shapes[1][1] = outputs[0]->attr.size[1];
    if (image_2d)
    {
        shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3];
    }
    else
    {
        shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3] * outputs[0]->attr.size[4];
    }
    rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 );
    rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 );
    if (rs_input == NULL || rs_output == NULL)
    {
        goto final;
    }
    status = _query_kernel( kernel, inputs, outputs, image_2d );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            node_params[0] = rs_input;
            node_params[1] = rs_output;
            node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &stride_w );
            node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &stride_h );
            node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride_d );
            node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_w );
            node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_h );
            node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_d );
            node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_front );
            node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_end );
            node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_front );
            node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_end );
            node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_front );
            node_params[13] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_end );
            node_params[14] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_w );
            node_params[15] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_h );
            node_params[16] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_d );
            node_params[17] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale );
            node_params[18] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile );
            status  = vsi_nn_kernel_node_pass_param( node, node_params, _COL2IM_PARAM_NUM );
            CHECK_STATUS(status);
            /* Slots 0 and 1 are the reshaped tensors (released below); every
             * later slot is a scalar created above. Releasing them in a loop
             * up to _COL2IM_PARAM_NUM guarantees the last one is not leaked. */
            for ( i = 2; i < _COL2IM_PARAM_NUM; i++ )
            {
                vsi_nn_kernel_scalar_release( &node_params[i] );
            }
        }
    }
 final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    return node;
 } /* _setup() */
 __END_DECLS
 REGISTER_BACKEND_CL( col2im, _setup )
--- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c
@ -46,21 +46,36 @@ __BEGIN_DECLS
 #define KERNEL_SOURCE_1    "cumsum"
 #define KERNEL_SOURCE_2    "cumsum_2d"
 #define KERNEL_SOURCE_3    "cumsum_array_axis0"
 #define KERNEL_SOURCE_4    "cumsum_array_axis1"
 #define KERNEL_SOURCE_5    "cumsum_array_axis2"
 #define KERNEL_SOURCE_6    "cumsum_array_2d_axis0"
 #define KERNEL_SOURCE_7    "cumsum_array_2d_axis1"
 // Add kernel hashtable here
 /*
  * Lookup key for the cumsum kernel map. After this change the fields are
  * packed as:
  *   AXIS << 20 | IN_DTYPE << 12 | OUT_DTYPE << 8 | _image_2d << 4 | is_array
  * NOTE(review): OUT_DTYPE moved from shift 4 to shift 8, so it now has only
  * 4 bits below IN_DTYPE (shift 12). An OUT_DTYPE value >= 16 would spill into
  * the IN_DTYPE field -- confirm every dtype that reaches this macro is < 16.
  * The parameters are not parenthesized inside the expansion, so pass plain
  * identifiers or constants, not compound expressions.
  */
-#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
+#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d, is_array) \
-    ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
+    ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array))
 /*
  * One kernel-map entry with _image_2d = 0 and (after this change)
  * is_array = 0. The function name expands to
  * "cl.cumsum_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>" wrapped by
  * CVIVANTE_NAMESPACE, and the source field is always KERNEL_SOURCE_1.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \
-        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
+        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \
        CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
        KERNEL_SOURCE_1 },
 /*
  * One kernel-map entry with _image_2d = 1 and (after this change)
  * is_array = 0. The function name expands to
  * "cl.cumsum_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>_2D" wrapped by
  * CVIVANTE_NAMESPACE, and the source field is always KERNEL_SOURCE_2.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \
-        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
+        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \
        CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
        KERNEL_SOURCE_2 },
 /*
  * One kernel-map entry for the "array" kernel variant: _image_2d = 0,
  * is_array = 1. The function name expands to
  * "cl.cumsum_array_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>" wrapped by
  * CVIVANTE_NAMESPACE. Unlike HASH_CUMSUM_KERNELS, the source is supplied by
  * the caller through SOURCE.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \
        CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
        SOURCE },
 /*
  * One kernel-map entry for the 2D "array" kernel variant: _image_2d = 1,
  * is_array = 1. The function name expands to
  * "cl.cumsum_array_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>_2D" wrapped by
  * CVIVANTE_NAMESPACE; the source is supplied by the caller through SOURCE.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 1), \
        CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
        SOURCE },
 static const struct {
        uint32_t key;
        char* function_name;
@ -82,6 +97,22 @@ static const struct {
    HASH_CUMSUM_KERNELS_2D(1, U8,  U8)
    HASH_CUMSUM_KERNELS_2D(1, F32, F32)
    HASH_CUMSUM_KERNELS_2D(1, F32, U8)
    HASH_CUMSUM_ARRAY_KERNELS(0, U8,  U8, KERNEL_SOURCE_3)
    HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)
    HASH_CUMSUM_ARRAY_KERNELS(0, F32, U8, KERNEL_SOURCE_3)
    HASH_CUMSUM_ARRAY_KERNELS(1, U8,  U8, KERNEL_SOURCE_4)
    HASH_CUMSUM_ARRAY_KERNELS(1, F32, F32, KERNEL_SOURCE_4)
    HASH_CUMSUM_ARRAY_KERNELS(1, F32, U8, KERNEL_SOURCE_4)
    HASH_CUMSUM_ARRAY_KERNELS(2, U8,  U8, KERNEL_SOURCE_5)
    HASH_CUMSUM_ARRAY_KERNELS(2, F32, F32, KERNEL_SOURCE_5)
    HASH_CUMSUM_ARRAY_KERNELS(2, F32, U8, KERNEL_SOURCE_5)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8,  U8, KERNEL_SOURCE_6)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, F32, KERNEL_SOURCE_6)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, U8, KERNEL_SOURCE_6)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8,  U8, KERNEL_SOURCE_7)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, F32, KERNEL_SOURCE_7)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, U8, KERNEL_SOURCE_7)
 };
 /*
@ -197,7 +228,8 @@ static vsi_status _query_kernel
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t axis,
-    int32_t is_2d
+    int32_t is_2d,
    int32_t is_array
    /* Add extra params */
    )
 {
@ -230,7 +262,7 @@ static vsi_status _query_kernel
        output_dtype = F32;
    }
-    key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
+    key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d, is_array);
    for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
    {
@ -270,6 +302,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_t             * kernel
    )
 {
 #define VSI_NN_MAX_BLOCK_SIZE  GPU_TENSOR_MAX_WIDTH
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
@ -291,6 +324,7 @@ static vsi_nn_kernel_node_t _setup
    int32_t height     = 0;
    int32_t channel    = 1;
    uint32_t i = 0;
    int32_t is_array   = 0;
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
@ -326,13 +360,16 @@ static vsi_nn_kernel_node_t _setup
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
        outputs[0], shapes[0], (vsi_size_t)rs_dim );
-    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
+    for (i = 0; i < rs_dim; i++)
                outputs[0]->attr.dim_num ) )
    {
-        return NULL;
+        if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE)
        {
            is_array = 1;
        }
    }
 #undef VSI_NN_MAX_BLOCK_SIZE
-    status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d );
+    status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d, is_array);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
--- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_TENSOR_GATHER_API_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
-
+#if !(VX_TENSOR_GATHER_API_SUPPORT)
 __BEGIN_DECLS
 /*
--- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
-
+#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
 __BEGIN_DECLS
 /*
--- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_LOGSOFTMAX_VX_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
-
+#if !(VX_LOGSOFTMAX_VX_SUPPORT)
 __BEGIN_DECLS
--- a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c
@ -36,6 +36,8 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
 __BEGIN_DECLS
 /*
@ -412,3 +414,4 @@ __END_DECLS
 REGISTER_BACKEND_CL( nearest_grid_sample, _setup )
 #endif
--- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_TENSOR_POW_API_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
-
+#if !(VX_TENSOR_POW_API_SUPPORT)
 __BEGIN_DECLS
 /*
--- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "libnnext/vx_lib_nnext.h"
-
+#if (!VX_RESIZE_BILINEAR_SH_SUPPORT)
 __BEGIN_DECLS
 #define _RESIZE_BILINEAR_KERNEL_SOURCE()      "resize_bilinear"
@ -319,3 +319,4 @@ static vsi_nn_kernel_node_t _setup
 __END_DECLS
 REGISTER_BACKEND_CL( resize_bilinear, _setup )
 #endif
--- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_TENSOR_TILE_API_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
-
+#if !(VX_TENSOR_TILE_API_SUPPORT)
 __BEGIN_DECLS
--- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c
@ -34,20 +34,24 @@
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "libnnext/vx_lib_nnext.h"
 __BEGIN_DECLS
 #define _TOPK_KERNEL_SOURCE      "topk"
 #define STR(a) #a
 // Add kernel hashtable here
 /*
  * Lookup key for the topk kernel map. After this change the fields are
  * packed as:
  *   IN_DTYPE | OUT_DTYPE << 8 | STAGES << 16 | SECTION << 26
  * which leaves 10 bits (16..25) for STAGES before the SECTION field starts.
  * In this file SECTION is 0 for PACK_KERNEL_MAP entries and 1 for
  * PACK_MERGE_KERNEL_MAP entries.
  * NOTE(review): OUT_DTYPE, STAGES and SECTION are not parenthesized before
  * the shift, so pass plain identifiers or constants only.
  */
-#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
+#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
-        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
+        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
 /*
  * One kernel-map entry for a fixed stage count, with SECTION = 0 after this
  * change. The function name is built by concatenating string literals:
  * "cl.topk_stage<STAGES>_<IN_DTYPE>to<OUT_DTYPE>_I32", wrapped by
  * CVIVANTE_NAMESPACE; the source is _TOPK_KERNEL_SOURCE.
  * The expansion has no trailing comma, so the table supplies the separator.
  */
 #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
-        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
+        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
          CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
          _TOPK_KERNEL_SOURCE }
 /*
  * One kernel-map entry keyed with STAGES = 0 and SECTION = 1. The function
  * name is "cl.topk_stage_<IN_DTYPE>to<OUT_DTYPE>_I32" (no stage number),
  * wrapped by CVIVANTE_NAMESPACE, and the source is the literal "topk2"
  * rather than _TOPK_KERNEL_SOURCE.
  * The expansion has no trailing comma, so the table supplies the separator.
  */
 #define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
          CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
          "topk2" }
 /*
  * Lookup key for the odd-even-sort kernel map: IN_DTYPE in the low bits and
  * OUT_DTYPE shifted left by 8. It carries no stage or section field.
  */
 #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
 #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -79,6 +83,7 @@ static const _kernel_map_type _topk_kernel_map[] =
    PACK_KERNEL_MAP( F32, F32, 4 ),
    PACK_KERNEL_MAP( F32, F32, 5 ),
    PACK_KERNEL_MAP( F32, F32, 6 ),
    PACK_KERNEL_MAP( F32, F32, 9 ),
    PACK_KERNEL_MAP( U32, U32, 0 ),
    PACK_KERNEL_MAP( U32, U32, 1 ),
@ -87,6 +92,7 @@ static const _kernel_map_type _topk_kernel_map[] =
    PACK_KERNEL_MAP( U32, U32, 4 ),
    PACK_KERNEL_MAP( U32, U32, 5 ),
    PACK_KERNEL_MAP( U32, U32, 6 ),
    PACK_KERNEL_MAP( U32, U32, 9 ),
    PACK_KERNEL_MAP( I32, I32, 0 ),
    PACK_KERNEL_MAP( I32, I32, 1 ),
@ -95,6 +101,7 @@ static const _kernel_map_type _topk_kernel_map[] =
    PACK_KERNEL_MAP( I32, I32, 4 ),
    PACK_KERNEL_MAP( I32, I32, 5 ),
    PACK_KERNEL_MAP( I32, I32, 6 ),
    PACK_KERNEL_MAP( I32, I32, 9 ),
    PACK_KERNEL_MAP( F32, U32, 0 ),
    PACK_KERNEL_MAP( F32, U32, 1 ),
@ -103,6 +110,7 @@ static const _kernel_map_type _topk_kernel_map[] =
    PACK_KERNEL_MAP( F32, U32, 4 ),
    PACK_KERNEL_MAP( F32, U32, 5 ),
    PACK_KERNEL_MAP( F32, U32, 6 ),
    PACK_KERNEL_MAP( F32, U32, 9 ),
    PACK_KERNEL_MAP( F32, I32, 0 ),
    PACK_KERNEL_MAP( F32, I32, 1 ),
@ -111,6 +119,10 @@ static const _kernel_map_type _topk_kernel_map[] =
    PACK_KERNEL_MAP( F32, I32, 4 ),
    PACK_KERNEL_MAP( F32, I32, 5 ),
    PACK_KERNEL_MAP( F32, I32, 6 ),
    PACK_KERNEL_MAP( F32, I32, 9 ),
    PACK_MERGE_KERNEL_MAP(U32, U32),
    PACK_MERGE_KERNEL_MAP(I32, I32),
 };
 static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@ -254,7 +266,8 @@ static vsi_status _query_kernel
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
-    int32_t num_stages
+    int32_t num_stages,
    vsi_bool is_bitnoic_segment
    )
 {
    vsi_status status = VSI_FAILURE;
@ -272,21 +285,23 @@ static vsi_status _query_kernel
    in_dtype  = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    num_stages = is_bitnoic_segment ? 0 : num_stages;
    switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
    {
    case _PACK_SELECT_KEY(F32, F32):
    case _PACK_SELECT_KEY(F16, F16):
-        key = TOPK_HASH_KEY( F32, F32, num_stages );
+        key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
        break;
    case _PACK_SELECT_KEY(U32, U32):
    case _PACK_SELECT_KEY(U16, U16):
    case _PACK_SELECT_KEY(U8,  U8):
-        key = TOPK_HASH_KEY( U32, U32, num_stages );
+        key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
        break;
    case _PACK_SELECT_KEY(I32, I32):
    case _PACK_SELECT_KEY(I16, I16):
    case _PACK_SELECT_KEY(I8,  I8):
-        key = TOPK_HASH_KEY( I32, I32, num_stages );
+        key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
        break;
    case _PACK_SELECT_KEY(F32, U32):
    case _PACK_SELECT_KEY(F16, U32):
@ -294,7 +309,7 @@ static vsi_status _query_kernel
    case _PACK_SELECT_KEY(F16, U16):
    case _PACK_SELECT_KEY(F32, U8):
    case _PACK_SELECT_KEY(F16, U8):
-        key = TOPK_HASH_KEY( F32, U32, num_stages );
+        key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
        break;
    case _PACK_SELECT_KEY(F32, I32):
    case _PACK_SELECT_KEY(F16, I32):
@ -302,7 +317,7 @@ static vsi_status _query_kernel
    case _PACK_SELECT_KEY(F16, I16):
    case _PACK_SELECT_KEY(F32, I8):
    case _PACK_SELECT_KEY(F16, I8):
-        key = TOPK_HASH_KEY( F32, I32, num_stages );
+        key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
        break;
    default:
        break;
@ -440,7 +455,12 @@ static vsi_nn_kernel_node_t _setup
    int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
    int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
    vsi_bool is_odd_even_sort = FALSE;
    vsi_bool is_bitnoic_segment = FALSE;
    size_t param_num = _TOPK_PARAM_NUM;
    int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
    vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    float inputScale  = vsi_nn_get_tensor_scale(inputs[0]);
    float inputTail   = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
@ -471,9 +491,22 @@ static vsi_nn_kernel_node_t _setup
    rs_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shape[0], 2 );
-    if (num_stages < 7)
+    is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
        type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
    if (is_bitnoic_segment && num_stages == 9)
    {
-        status = _query_kernel( kernel, inputs, outputs, num_stages );
+        is_bitnoic_segment = FALSE;
    }
    else
    {
        num_stages = is_bitnoic_segment ? 9 : num_stages;
        max_stages = is_bitnoic_segment ? max_stages : 7;
    }
    if (num_stages < max_stages || is_bitnoic_segment)
    {
        status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );
        rs_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shape[1], 2 );
--- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c
@ -35,6 +35,8 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (!VX_ARGMAX_VX_SUPPORT)
 __BEGIN_DECLS
 #define HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
@ -510,3 +512,4 @@ __END_DECLS
 REGISTER_BACKEND_EVIS( argmax, _setup )
 #endif
--- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c
@ -51,26 +51,49 @@ __BEGIN_DECLS
 #define KERNEL_SOURCE_5    "cumsum_ex_rev_axis0"
 #define KERNEL_SOURCE_6    "cumsum_ex_rev_axis1"
 #define KERNEL_SOURCE_7    "cumsum_ex_rev_axis2"
 #define KERNEL_SOURCE_8    "cumsum_array"
 #define KERNEL_SOURCE_9    "cumsum_array_2d"
 #define KERNEL_SOURCE_10   "cumsum_array_bf16"
 #define KERNEL_SOURCE_11   "cumsum_array_f16_u8"
 #define KERNEL_SOURCE_12   "cumsum_array_ex_rev_axis0"
 #define KERNEL_SOURCE_13   "cumsum_array_ex_rev_axis1"
 #define KERNEL_SOURCE_14   "cumsum_array_ex_rev_axis2"
 #define KERNEL_SOURCE_15   "cumsum_array_f16_u8_2d"
 // Add kernel hashtable here
 /*
  * Lookup key for the EVIS cumsum kernel map. After this change the fields
  * are packed as:
  *   EX_REV << 24 | AXIS << 20 | IN_DTYPE << 12 | OUT_DTYPE << 8 |
  *   _image_2d << 4 | is_array
  * NOTE(review): OUT_DTYPE moved from shift 4 to shift 8, so it now has only
  * 4 bits below IN_DTYPE (shift 12). An OUT_DTYPE value >= 16 would spill into
  * the IN_DTYPE field -- confirm every dtype that reaches this macro is < 16.
  * The parameters are not parenthesized inside the expansion, so pass plain
  * identifiers or constants, not compound expressions.
  */
-#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \
+#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d, is_array) \
-    ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
+    ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array))
 /*
  * One kernel-map entry with EX_REV = 0, _image_2d = 0 and (after this
  * change) is_array = 0. The function name expands to
  * "evis.cumsum_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>" wrapped by
  * CVIVANTE_NAMESPACE; the source is supplied through SOURCE.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
-        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \
+        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 0), \
        CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
        SOURCE },
 /*
  * One kernel-map entry with EX_REV = 0, _image_2d = 1 and (after this
  * change) is_array = 0. The function name expands to
  * "evis.cumsum_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>_2D" wrapped by
  * CVIVANTE_NAMESPACE; the source is supplied through SOURCE.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
-        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \
+        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 0), \
        CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
        SOURCE },
 /*
  * One kernel-map entry with EX_REV = 1, _image_2d = 0 and (after this
  * change) is_array = 0. The function name expands to
  * "evis.cumsum_ex_rev_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>" wrapped by
  * CVIVANTE_NAMESPACE; the source is supplied through SOURCE.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
-        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \
+        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 0), \
        CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
        SOURCE },
 /*
  * One kernel-map entry for the "array" kernel variant: EX_REV = 0,
  * _image_2d = 0, is_array = 1. The function name expands to
  * "evis.cumsum_array_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>" wrapped by
  * CVIVANTE_NAMESPACE; the source is supplied through SOURCE.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 1), \
        CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
        SOURCE },
 /*
  * One kernel-map entry for the 2D "array" kernel variant: EX_REV = 0,
  * _image_2d = 1, is_array = 1. The function name expands to
  * "evis.cumsum_array_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>_2D" wrapped by
  * CVIVANTE_NAMESPACE; the source is supplied through SOURCE.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 1), \
        CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
        SOURCE },
 /*
  * One kernel-map entry for the exclusive/reverse "array" kernel variant:
  * EX_REV = 1, _image_2d = 0, is_array = 1. The function name expands to
  * "evis.cumsum_ex_rev_array_<IN_DTYPE>to<OUT_DTYPE>_axis<AXIS>" wrapped by
  * CVIVANTE_NAMESPACE; the source is supplied through SOURCE.
  * NOTE(review): the function name says "ex_rev_array" while the
  * KERNEL_SOURCE_12..14 strings in this file say "array_ex_rev"; confirm the
  * function names match the ones defined in those kernel sources.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define HASH_CUMSUM_ARRAY_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
        { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 1), \
        CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
        SOURCE },
 static const struct {
        uint32_t key;
        char* function_name;
@ -135,6 +158,65 @@ static const struct {
    HASH_CUMSUM_EX_REV_KERNELS(2, F16,  U8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_EX_REV_KERNELS(2, F16,  I8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_EX_REV_KERNELS(2, F16,  I16, KERNEL_SOURCE_4)
    HASH_CUMSUM_ARRAY_KERNELS(0, U8,   U8,   KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(0, I8,   I8,   KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(0, I16,  I16,  KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(0, F16,  F16,  KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(0, BF16, BF16, KERNEL_SOURCE_10)
    HASH_CUMSUM_ARRAY_KERNELS(1, U8,   U8,   KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(1, I8,   I8,   KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(1, I16,  I16,  KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(1, F16,  F16,  KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(1, BF16, BF16, KERNEL_SOURCE_10)
    HASH_CUMSUM_ARRAY_KERNELS(2, U8,   U8,   KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(2, I8,   I8,   KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(2, I16,  I16,  KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(2, F16,  F16,  KERNEL_SOURCE_8)
    HASH_CUMSUM_ARRAY_KERNELS(2, BF16, BF16, KERNEL_SOURCE_10)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8,   U8,   KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, I8,   I8,   KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, I16,  I16,  KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16,  F16,  KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_10)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8,   U8,   KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, I8,   I8,   KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, I16,  I16,  KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16,  F16,  KERNEL_SOURCE_9)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_10)
    HASH_CUMSUM_ARRAY_KERNELS(0, F16,  U8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(0, F16,  I8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(0, F16,  I16, KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(1, F16,  U8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(1, F16,  I8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(1, F16,  I16, KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(2, F16,  U8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(2, F16,  I8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS(2, F16,  I16, KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16,  U8,  KERNEL_SOURCE_15)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16,  I8,  KERNEL_SOURCE_15)
    HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16,  I16, KERNEL_SOURCE_15)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16,  U8,  KERNEL_SOURCE_15)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16,  I8,  KERNEL_SOURCE_15)
    HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16,  I16, KERNEL_SOURCE_15)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, U8,   U8,  KERNEL_SOURCE_12)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I8,   I8,  KERNEL_SOURCE_12)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I16,  I16, KERNEL_SOURCE_12)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, F16,  F16, KERNEL_SOURCE_12)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, U8,   U8,  KERNEL_SOURCE_13)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I8,   I8,  KERNEL_SOURCE_13)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I16,  I16, KERNEL_SOURCE_13)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16,  F16, KERNEL_SOURCE_13)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, U8,   U8,  KERNEL_SOURCE_14)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I8,   I8,  KERNEL_SOURCE_14)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I16,  I16, KERNEL_SOURCE_14)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16,  F16, KERNEL_SOURCE_14)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16,  U8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16,  I8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16,  I16, KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16,  U8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16,  I8,  KERNEL_SOURCE_11)
    HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16,  I16, KERNEL_SOURCE_11)
 };
 /*
@ -161,6 +243,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    size_t                              param_size
    )
 {
 #define VSI_NN_MAX_BLOCK_SIZE  GPU_TENSOR_MAX_WIDTH
    vsi_status status = VSI_FAILURE;
    gpu_param_t shaderParam = {
        3,          // workdim
@ -188,6 +271,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    float   in_out_zp_scale = 1.0f;
    float   in_out_scale    = 1.0f;
    int32_t is_array        = 0;
    int32_t remainder       = 0;
    uint32_t pack_key = 0;
    VSI_UNREFERENCED(param_size);
@ -219,7 +305,15 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    height  = (int32_t)(input_shape->data[1]);
    channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);
    if (width > VSI_NN_MAX_BLOCK_SIZE ||
       height > VSI_NN_MAX_BLOCK_SIZE ||
       channel > VSI_NN_MAX_BLOCK_SIZE)
    {
        is_array = 1;
    }
 #undef VSI_NN_MAX_BLOCK_SIZE
    if (axis == 0)
    {
        w = 1;
@ -245,6 +339,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    {
        shaderParam.global_scale[0]  = 16;
    }
    remainder = w % shaderParam.global_scale[0];
    shaderParam.global_scale[1]  = 1;
    shaderParam.global_scale[2]  = 1;
    shaderParam.global_size[0]   = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0];
@ -253,6 +348,12 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    status = vsi_nn_kernel_gpu_config( node, &shaderParam );
    CHECK_STATUS_FAIL_GOTO(status, OnError);
    if (is_array)
    {
        status = vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder);
        status |= vsi_nn_kernel_gpu_add_param(node, "w_size", &w);
        CHECK_STATUS_FAIL_GOTO(status, OnError);
    }
 #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM)    \
        (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24))
@ -767,7 +868,8 @@ static vsi_status _query_kernel
    const vsi_nn_kernel_param_t * params,
    int32_t axis,
    int32_t is_2d,
-    int32_t is_ex_rev
+    int32_t is_ex_rev,
    int32_t is_array
    )
 {
    vsi_status status = VSI_FAILURE;
@ -781,7 +883,7 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
-    key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d);
+    key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d, is_array);
    for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
    {
@ -819,6 +921,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_t             * kernel
    )
 {
 #define VSI_NN_MAX_BLOCK_SIZE  GPU_TENSOR_MAX_WIDTH
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
@ -831,7 +934,10 @@ static vsi_nn_kernel_node_t _setup
    int32_t is_2d      = 0;
    uint32_t rs_dim    = 2;
    uint32_t i         = 0;
    int32_t is_array   = 0;
    int32_t is_ex_or_rev  = exclusive || reverse;
    vsi_nn_kernel_dtype_e input0_dtype = U8;
    int32_t width         = 0;
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
@ -860,7 +966,30 @@ static vsi_nn_kernel_node_t _setup
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
        outputs[0], shapes[0], (vsi_size_t)rs_dim );
-    status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev);
+    width = (int32_t)shapes[0][0];
    for (i = 0; i < rs_dim; i++)
    {
        if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE)
        {
            is_array = 1;
        }
    }
 #undef VSI_NN_MAX_BLOCK_SIZE
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    if (is_array &&
       ((axis_new == 0 && width < 8) ||
       (axis_new > 0 && (((input0_dtype == U8 || input0_dtype == I8) && width < 16) ||
       ((input0_dtype != U8 && input0_dtype != I8) && width < 8)))
       ))
    {
        return NULL;
    }
    status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev, is_array);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
--- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_TENSOR_GATHER_API_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -35,7 +35,7 @@
 #include "vsi_nn_error.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
-
+#if !(VX_TENSOR_GATHER_API_SUPPORT)
 __BEGIN_DECLS
 /*
--- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
@ -58,14 +58,14 @@ __BEGIN_DECLS
    _3D
 } vsi_nn_kernel_coord_type_e;
-#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim) \
+#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim, is_array) \
-    ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim))
+    ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim << 4) | (is_array))
 /*
  * HASH_GATHER_ND_KEY (above) is the lookup key for the gather_nd kernel map.
  * After this change the fields are packed as:
  *   _input0_type << 24 | _output_type << 16 | _coord_dim << 8 |
  *   _batch_dim << 4 | is_array
  * _batch_dim moved from the low bits to shift 4, so it now has 4 bits below
  * _coord_dim, and is_array takes the low bits.
  * NOTE(review): the parameters are not parenthesized inside the expansion,
  * so pass plain identifiers or constants only.
  */
 /*
  * Function name for the plain gather_nd kernel:
  * "evis.gather_nd_<SRC0_TYPE>to<DST_TYPE><COORD_TYPE>" wrapped by
  * CVIVANTE_NAMESPACE. COORD_TYPE is stringized as written, so no separator
  * is inserted before it.
  */
 #define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
    CVIVANTE_NAMESPACE("evis.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
 /*
  * One kernel-map entry for the plain kernel: _batch_dim = 0 and (after this
  * change) is_array = 0. IN1_TYPE is accepted but not used in the expansion.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
-    { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0), \
+    { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 0), \
        HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
        SOURCE },
@ -73,10 +73,26 @@ __BEGIN_DECLS
    CVIVANTE_NAMESPACE("evis.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
 /*
  * One kernel-map entry for the batch kernel: _batch_dim = 1 and (after this
  * change) is_array = 0. The function name comes from
  * HASH_GATHER_ND_BATCH_SH_KERNEL_NAME. IN1_TYPE is accepted but not used in
  * the expansion.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
-    { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1), \
+    { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 0), \
        HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
        SOURCE },
 /*
  * Function name for the "array" gather_nd kernel:
  * "evis.gather_nd_array_<SRC0_TYPE>to<DST_TYPE><COORD_TYPE>" wrapped by
  * CVIVANTE_NAMESPACE.
  */
 #define HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
    CVIVANTE_NAMESPACE("evis.gather_nd_array_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
 /*
  * One kernel-map entry for the "array" kernel: _batch_dim = 0, is_array = 1.
  * IN1_TYPE is accepted but not used in the expansion.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define TENSOR_GATHER_ND_ARRAY_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
    { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 1), \
        HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
        SOURCE },
 /*
  * Function name for the "array" batch gather_nd kernel:
  * "evis.gather_nd_array_batch_<SRC0_TYPE>to<DST_TYPE><COORD_TYPE>" wrapped by
  * CVIVANTE_NAMESPACE.
  */
 #define HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
    CVIVANTE_NAMESPACE("evis.gather_nd_array_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
 /*
  * One kernel-map entry for the "array" batch kernel: _batch_dim = 1,
  * is_array = 1. IN1_TYPE is accepted but not used in the expansion.
  * The expansion ends with a comma, so entries are listed without separators.
  */
 #define TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
    { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 1), \
        HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
        SOURCE },
 static const struct {
        uint32_t key;
        char* function_name;
@ -125,6 +141,50 @@ static const struct {
    TENSOR_GATHER_ND_BATCH_KERNELS(U8,  I32, U8,  _2D,      KERNEL_SOURCE_8)
    TENSOR_GATHER_ND_BATCH_KERNELS(I16, I32, I16, _2D,      KERNEL_SOURCE_8)
    TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D,      KERNEL_SOURCE_8)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I8,  I32, I8,  _1D,      KERNEL_SOURCE_1)
    TENSOR_GATHER_ND_ARRAY_KERNELS(U8,  I32, U8,  _1D,      KERNEL_SOURCE_1)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _1D,      KERNEL_SOURCE_1)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _1D,      KERNEL_SOURCE_1)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I8,  I32, I8,  _2D,      KERNEL_SOURCE_2)
    TENSOR_GATHER_ND_ARRAY_KERNELS(U8,  I32, U8,  _2D,      KERNEL_SOURCE_2)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _2D,      KERNEL_SOURCE_2)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _2D,      KERNEL_SOURCE_2)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I8,  I32, I8,  _3D,      KERNEL_SOURCE_3)
    TENSOR_GATHER_ND_ARRAY_KERNELS(U8,  I32, U8,  _3D,      KERNEL_SOURCE_3)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _3D,      KERNEL_SOURCE_3)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _3D,      KERNEL_SOURCE_3)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I8,  I32, F16, _1D,      KERNEL_SOURCE_4)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _1D,      KERNEL_SOURCE_4)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8,  _1D,      KERNEL_SOURCE_4)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _1D,      KERNEL_SOURCE_4)
    TENSOR_GATHER_ND_ARRAY_KERNELS(U8,  I32, F16, _1D,      KERNEL_SOURCE_4)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8,  _1D,      KERNEL_SOURCE_4)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I8,  I32, F16, _2D,      KERNEL_SOURCE_5)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _2D,      KERNEL_SOURCE_5)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8,  _2D,      KERNEL_SOURCE_5)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _2D,      KERNEL_SOURCE_5)
    TENSOR_GATHER_ND_ARRAY_KERNELS(U8,  I32, F16, _2D,      KERNEL_SOURCE_5)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8,  _2D,      KERNEL_SOURCE_5)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I8,  I32, F16, _3D,      KERNEL_SOURCE_6)
    TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _3D,      KERNEL_SOURCE_6)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8,  _3D,      KERNEL_SOURCE_6)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _3D,      KERNEL_SOURCE_6)
    TENSOR_GATHER_ND_ARRAY_KERNELS(U8,  I32, F16, _3D,      KERNEL_SOURCE_6)
    TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8,  _3D,      KERNEL_SOURCE_6)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8,  I32, I8,  _1D,      KERNEL_SOURCE_7)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8,  I32, U8,  _1D,      KERNEL_SOURCE_7)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _1D,      KERNEL_SOURCE_7)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _1D,      KERNEL_SOURCE_7)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8,  I32, I8,  _2D,      KERNEL_SOURCE_8)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8,  I32, U8,  _2D,      KERNEL_SOURCE_8)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _2D,      KERNEL_SOURCE_8)
    TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _2D,      KERNEL_SOURCE_8)
 };
 /*
@ -148,7 +208,8 @@ static vsi_status get_gather_nd_tensor_reshape_size
    vsi_size_t block_size,
    uint32_t coordDim,
    int32_t* newDim,
-    uint32_t  batch_dims
+    uint32_t  batch_dims,
    int32_t* arrayFlg
    )
 {
    vsi_status status = VSI_FAILURE;
@ -184,12 +245,20 @@ static vsi_status get_gather_nd_tensor_reshape_size
            for (i = 0; i < coordDim - 1; i++)
            {
                sizes[rank++] = input_size[i + offset];
                if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH)
                {
                    arrayFlg[0] = 1;
                }
            }
            for (i = 0; i < batch_dims; i++)
            {
                sizes[rank] *= input_size[dims_num - i - 1];
            }
            if (sizes[rank] >= VSI_NN_MAX_IMAGE_WIDTH)
            {
                arrayFlg[0] = 1;
            }
            newDim[0] = rank + 1;
        }
@ -198,6 +267,10 @@ static vsi_status get_gather_nd_tensor_reshape_size
            for (i = coordDim-1; i > 0; i--)
            {
                sizes[i] = input_size[i + offset - 1];
                if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH)
                {
                    arrayFlg[0] = 1;
                }
            }
            for (i = 0; i < offset; i++)
            {
@ -210,6 +283,10 @@ static vsi_status get_gather_nd_tensor_reshape_size
                newDim[0] = 2;
                sizes[0] = block_size;
                sizes[1] = elementCnt / block_size;
                if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH)
                {
                    arrayFlg[0] = 1;
                }
            }
            else if (coordDim == 4)
            {
@ -242,6 +319,14 @@ static vsi_status get_gather_nd_tensor_reshape_size
            status = VSI_SUCCESS;
            newDim[0] = 3;
        }
        else
        {
            sizes[0] = block_size;
            sizes[1] = elementCnt / block_size;
            status = VSI_SUCCESS;
            newDim[0] = 2;
            arrayFlg[0] = 1;
        }
    }
 #undef VSI_NN_MAX_IMAGE_WIDTH
@ -409,7 +494,8 @@ static vsi_status _query_kernel
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel,
    int32_t coord_dim,
-    int32_t batch_dims
+    int32_t batch_dims,
    int32_t is_array
    )
 {
    vsi_status status = VSI_FAILURE;
@ -444,7 +530,7 @@ static vsi_status _query_kernel
        coord_type = _3D;
    }
-    key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg );
+    key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg, is_array);
    for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
    {
@ -482,6 +568,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_t             * kernel
    )
 {
 #define VSI_NN_MAX_BLOCK_SIZE  GPU_TENSOR_MAX_WIDTH
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
@ -489,26 +576,41 @@ static vsi_nn_kernel_node_t _setup
    int32_t batch_dims  = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
    int32_t block_size  = vsi_nn_kernel_param_get_int32( params, "block_size" );
    int32_t coord_dim   = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
    int32_t input_size  = 1;
    int32_t no_block_batch_size = 1;
    int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
    int32_t is_array    = 0;
    int32_t i = 0;
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
-    status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims);
+    for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++)
-    status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims);
+    {
-    status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims);
+        input_size = input_size * (int32_t)inputs[0]->attr.size[i];
    }
    no_block_batch_size = input_size / block_size;
    is_array = no_block_batch_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;
    status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0],
        block_size, coord_dim, &rs_in_dim, batch_dims, &is_array);
    status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1],
        coord_dim, 0, &rs_idx_dim, batch_dims, &is_array);
    status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2],
        block_size, 0, &rs_out_dim, batch_dims, &is_array);
 #undef VSI_NN_MAX_BLOCK_SIZE
    if (status != VSI_SUCCESS)
    {
        return NULL;
    }
-    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
+    //if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
-                outputs[0]->attr.dim_num ) )
+    //            outputs[0]->attr.dim_num ) )
-    {
+    //{
-        return NULL;
+    //    return NULL;
-    }
+    //}
-    status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims );
+    status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims, is_array);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
--- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
-
+#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
 __BEGIN_DECLS
 #define SOURCE_AXIS0_0     "layer_normalization_0"
--- a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_LOGSOFTMAX_VX_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
-
+#if !(VX_LOGSOFTMAX_VX_SUPPORT)
 __BEGIN_DECLS
 #define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
--- a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c
@ -36,6 +36,8 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
 __BEGIN_DECLS
 /*
@ -625,3 +627,4 @@ __END_DECLS
 REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup )
 #endif
--- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_TENSOR_POW_API_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
-
+#if !(VX_TENSOR_POW_API_SUPPORT)
 __BEGIN_DECLS
 #define KERNEL_SOURCE    "pow",
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c
@ -750,6 +750,7 @@ static vsi_nn_kernel_node_t _setup
    shape[2] = 1;
    reshape_tensor = vsi_nn_reshape_tensor( graph,
            outputs[0], shape, outputs[0]->attr.dim_num );
    CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final);
    if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
                outputs[0]->attr.dim_num ) )
@ -819,6 +820,7 @@ static vsi_nn_kernel_node_t _setup
 final:
    vsi_nn_safe_free(node_params);
    vsi_safe_release_tensor(reshape_tensor);
    return node;
 } /* _setup() */
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c
@ -911,6 +911,7 @@ static vsi_nn_kernel_node_t _setup
    shape[2] = 1;
    reshape_tensor = vsi_nn_reshape_tensor( graph,
            outputs[0], shape, outputs[0]->attr.dim_num );
    CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final);
    if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
                outputs[0]->attr.dim_num ) )
@ -978,6 +979,7 @@ static vsi_nn_kernel_node_t _setup
 final:
    vsi_nn_safe_free(node_params);
    vsi_safe_release_tensor(reshape_tensor);
    return node;
 } /* _setup() */
--- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "utils/vsi_nn_dtype_util_prv.h"
-
+#if (!VX_RESIZE_BILINEAR_SH_SUPPORT)
 __BEGIN_DECLS
 /*
@ -1515,3 +1515,4 @@ final:
 __END_DECLS
 REGISTER_BACKEND_EVIS( resize_bilinear, _setup )
 #endif
--- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c
@ -22,7 +22,7 @@
 *
 *****************************************************************************/
-#if !(VX_TENSOR_TILE_API_SUPPORT)
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@ -36,7 +36,7 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
-
+#if !(VX_TENSOR_TILE_API_SUPPORT)
 __BEGIN_DECLS
 /*
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
@ -29,6 +29,7 @@
 #include "vsi_nn_context.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_types.h"
 #include "vsi_nn_types_prv.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_error.h"
@ -1673,7 +1674,7 @@ vsi_status vsi_nn_KernelGpuConfig
 static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
 {
-    int32_t enableShader = graph->ctx->options.enable_shader;
+    int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;
 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
    if ( graph->ctx->config.subGroupSize == 0 )
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
@ -181,6 +181,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(cos)
 #if (VX_LOGSOFTMAX_VX_SUPPORT)
 REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
 #endif
 #if (VX_BITCAST_VX_SUPPORT)
 REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
 #endif
 __END_DECLS
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
@ -916,11 +916,21 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node
    {
        input = in_tensor;
        output = tensor;
        /* Create an OpenVX tensor if it does not exist */
        if (NULL == input->t)
        {
            vsi_nn_TensorReinit(graph, input);
        }
    }
    else
    {
        input = tensor;
        output = in_tensor;
        /* Create an OpenVX tensor if it does not exist */
        if (NULL == output->t)
        {
            vsi_nn_TensorReinit(graph, output);
        }
    }
    vxTensorReshapeNode(graph->g, input->t, &reshape_param, sizeof(reshape_param), output->t);
--- a/src/tim/vx/internal/src/kernel/vx/argmax_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/argmax_vx.c
@ -0,0 +1,79 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include "vsi_nn_types.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_tensor_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (VX_ARGMAX_VX_SUPPORT)
 /*
  * REGISTER_ARGMAXOPENVX_KERNEL( kernel_name )
  *
  * Expands to three things, in order:
  *   1. a forward declaration of the static setup function
  *      _<kernel_name>setup (note: no underscore before "setup"),
  *   2. REGISTER_BACKEND_OPENVX( kernel_name, _<kernel_name>setup ),
  *   3. the header of the function definition, left open on purpose.
  *
  * Because the expansion ends on the parameter list, the macro invocation
  * must be followed immediately by the function body in braces.
  */
 #define REGISTER_ARGMAXOPENVX_KERNEL( kernel_name )   \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        )
 /*
  * OpenVX backend setup for argmax.
  *
  * Reads the int32 "axis" entry from params and creates the node with
  * vxArgmaxLayer on inputs[0] -> outputs[0]. The value returned by
  * vxArgmaxLayer is handed back unchanged (cast to vsi_nn_kernel_node_t);
  * no validation of the node is done here.
  */
 REGISTER_ARGMAXOPENVX_KERNEL( argmax )
 {
    vx_node node = NULL;
    int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
    /* params is read above, so only the parameters that are never used
     * are marked as unreferenced. */
    VSI_UNREFERENCED(kernel);
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    node = vxArgmaxLayer(graph->g,
                        inputs[0]->t,
                        axis,
                        outputs[0]->t
                        );
    return (vsi_nn_kernel_node_t)node;
 } /* argmax() */
 #undef REGISTER_ARGMAXOPENVX_KERNEL
 #endif
--- a/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c
@ -0,0 +1,77 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include "vsi_nn_types.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_tensor_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (VX_BITCAST_VX_SUPPORT)
 /*
  * REGISTER_BITCASTOPENVX_KERNEL( kernel_name )
  *
  * Expands to three things, in order:
  *   1. a forward declaration of the static setup function
  *      _<kernel_name>setup (note: no underscore before "setup"),
  *   2. REGISTER_BACKEND_OPENVX( kernel_name, _<kernel_name>setup ),
  *   3. the header of the function definition, left open on purpose.
  *
  * Because the expansion ends on the parameter list, the macro invocation
  * must be followed immediately by the function body in braces.
  */
 #define REGISTER_BITCASTOPENVX_KERNEL( kernel_name )   \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        )
 REGISTER_BITCASTOPENVX_KERNEL( bitcast )
 {
    vx_node node = NULL;
    VSI_UNREFERENCED(kernel);
    VSI_UNREFERENCED(params);
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    node = vxBitCastLayer(graph->g,
                              inputs[0]->t,
                              outputs[0]->t
                              );
    return (vsi_nn_kernel_node_t)node;
 } /* bitcast() */
 #undef REGISTER_BITCASTOPENVX_KERNEL
 #endif
--- a/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c
@ -0,0 +1,91 @@
 /****************************************************************************
 *
 *    Copyright (c) 2021 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include "vsi_nn_types.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_tensor_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (VX_NEAREST_GRID_SAMPLE_VX_SUPPORT)
 /*
  * Creates the OpenVX grid-sample node.
  *
  * Three int32 entries are read from params ("mode", "align_corners",
  * "padding_mode") and forwarded, together with inputs[0], inputs[1] and
  * outputs[0], to vxGridSampleLayer. The resulting node is returned as-is,
  * cast to vsi_nn_kernel_node_t.
  */
 static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
 {
    const int32_t sample_mode     = vsi_nn_kernel_param_get_int32(params, "mode");
    const int32_t corners_aligned = vsi_nn_kernel_param_get_int32(params, "align_corners");
    const int32_t padding         = vsi_nn_kernel_param_get_int32(params, "padding_mode");
    /* The tensor counts and the kernel descriptor are not needed here. */
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(kernel);
    return (vsi_nn_kernel_node_t)vxGridSampleLayer(graph->g,
                                                   inputs[0]->t,
                                                   inputs[1]->t,
                                                   sample_mode,
                                                   corners_aligned,
                                                   padding,
                                                   outputs[0]->t);
 } /* _setup() */
 /*
  * REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL( KERNEL_NAME )
  *
  * Defines a static wrapper _<KERNEL_NAME>_setup that forwards all of its
  * arguments unchanged to _setup(), then registers that wrapper through
  * REGISTER_BACKEND_OPENVX under KERNEL_NAME.
  */
 #define REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL(KERNEL_NAME) \
    static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num, \
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        ) \
    { \
        return _setup(graph, inputs, input_num, outputs, output_num, \
                params, kernel); \
    } \
    REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )
 /* Registers the nearest_grid_sample kernel; the helper macro is local to this file. */
 REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL( nearest_grid_sample )
 #undef REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL
 #endif
--- a/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c
@ -0,0 +1,82 @@
 /****************************************************************************
 *
 *    Copyright (c) 2021 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include "vsi_nn_types.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_tensor_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #if (VX_L1_LAYER_NORM_VX_SUPPORT)
 /*
  * REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( kernel_name )
  *
  * Expands to three things, in order:
  *   1. a forward declaration of the static setup function
  *      _<kernel_name>setup (note: no underscore before "setup"),
  *   2. REGISTER_BACKEND_OPENVX( kernel_name, _<kernel_name>setup ),
  *   3. the header of the function definition, left open on purpose.
  *
  * Because the expansion ends on the parameter list, the macro invocation
  * must be followed immediately by the function body in braces.
  */
 #define REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( kernel_name )   \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        )
 REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( l1_layer_norm )
 {
    vx_node node = NULL;
    float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
    VSI_UNREFERENCED(kernel);
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    node = vxL1LayerNormalizationLayer(
        graph->g,
        eps,
        axis,
        inputs[0]->t,
        inputs[1]->t,
        inputs[2]->t,
        inputs[3]->t,
        outputs[0]->t
        );
    return (vsi_nn_kernel_node_t)node;
 } /* l1_layer_norm() */
 #undef REGISTER_L1_LAYER_NORM_OPENVX_KERNEL
 #endif
--- a/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl
@ -0,0 +1,162 @@
 #pragma OPENCL EXTENSION cl_viv_vx_extension : enable
 #include "cl_viv_vx_ext.h"
 _viv_uniform int width_pad;
 _viv_uniform int height_pad;
 _viv_uniform int depth_pad;
 _viv_uniform int move_time_x;
 _viv_uniform int move_time_y;
 _viv_uniform int kernel_x_new;
 _viv_uniform int kernel_y_new;
 _viv_uniform int kernel_z_new;
 _viv_uniform int depth;
 /*
  * COL2IM(name, read_type, dst_type, convert_type, write_type)
  *
  * Generates the kernel col2im_<name>. Each work-item produces one output
  * element at (x, y, z) = global ids 0..2:
  *   - global z is split into b = z / depth (used as coord_in.z) and
  *     z % depth (the depth position used in the accumulation);
  *   - the front paddings are added to x, y, z;
  *   - the loops over k, j, i start at (coordinate % stride) and advance by
  *     the stride, so (coordinate - index) is always a multiple of the
  *     stride and the "+ stride - 1" rounding in the divisions is exact;
  *   - an index is skipped when the coordinate minus the index is negative,
  *     when coordinate + (kernel_*_new - index) exceeds the *_pad uniform,
  *     or when the index is not a multiple of the dilation;
  *   - coord_in.x combines the three per-axis quotients using move_time_x
  *     and move_time_y, coord_in.y combines the three kernel indices using
  *     kernel_x and kernel_y;
  *   - the float sum is mapped with sum * inOutScale + inOutTile, converted
  *     with convert_type and written to the .x lane of the output texel.
  *
  * pad_*_end and kernel_z are accepted but not read in the body.
  * NOTE(review): kernel_*_new, *_pad, move_time_* and depth are file-scope
  * _viv_uniform values; presumably the dilated kernel extents, padded
  * output extents and window counts per axis - confirm against the host
  * side that sets them.
  */
 #define COL2IM(name, read_type, dst_type ,convert_type, write_type) \
 __kernel void col2im_##name \
 ( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
                 int              stride_w, \
                 int              stride_h, \
                 int              stride_d, \
                 int              dilation_w, \
                 int              dilation_h, \
                 int              dilation_d, \
                 int              pad_w_front, \
                 int              pad_w_end, \
                 int              pad_h_front, \
                 int              pad_h_end, \
                 int              pad_d_front, \
                 int              pad_d_end, \
                 int              kernel_x, \
                 int              kernel_y, \
                 int              kernel_z, \
                 float            inOutScale, \
                 float            inOutTile \
 ) \
 { \
    int x = get_global_id(0); \
    int y = get_global_id(1); \
    int z = get_global_id(2); \
    int4 coord_out = (int4)(x,y,z,0); \
    int b = z / depth; \
    z = z % depth; \
    int4 coord_in = (int4)(0,0,b,0); \
 \
    float sum = 0.0f; \
    x = x + pad_w_front; \
    y = y + pad_h_front; \
    z = z + pad_d_front; \
    int offset_x = x % stride_w; \
    int offset_y = y % stride_h; \
    int offset_z = z % stride_d; \
    int i,j,k; \
    for (k = offset_z; k < kernel_z_new; k += stride_d) \
    { \
        if ((z - k) < 0 || (z + (kernel_z_new - k)) > depth_pad || k % dilation_d != 0) \
        { \
            continue; \
        } \
        for (j = offset_y; j < kernel_y_new; j = j + stride_h) \
        { \
            if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \
            { \
                continue; \
            } \
            for (i = offset_x; i < kernel_x_new; i = i + stride_w) \
            { \
                if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \
                { \
                    continue; \
                } \
                coord_in.x = (x - i + stride_w - 1) / stride_w + \
                             (y - j + stride_h - 1) / stride_h * move_time_x + \
                             (z - k + stride_d - 1) / stride_d * move_time_y * move_time_x; \
                coord_in.y = i / dilation_w + j * kernel_x / dilation_h + k * kernel_x * kernel_y / dilation_d; \
                sum = sum + convert_float(read_type(input, coord_in).x); \
            } \
        } \
    } \
    sum = sum * inOutScale + inOutTile; \
    dst_type dst = 0; \
    dst.x = convert_type(sum); \
    write_type(output, coord_out, dst); \
 }
 /* One kernel per (input, output) pairing of the U32 / I32 / F32 image formats. */
 COL2IM(U32toU32, read_imageui, uint4,  convert_uint,  write_imageui)
 COL2IM(U32toI32, read_imageui, int4,   convert_int,   write_imagei)
 COL2IM(U32toF32, read_imageui, float4, convert_float, write_imagef)
 COL2IM(I32toU32, read_imagei,  uint4,  convert_uint,  write_imageui)
 COL2IM(I32toI32, read_imagei,  int4,   convert_int,   write_imagei)
 COL2IM(I32toF32, read_imagei,  float4, convert_float, write_imagef)
 COL2IM(F32toU32, read_imagef,  uint4,  convert_uint,  write_imageui)
 COL2IM(F32toI32, read_imagef,  int4,   convert_int,   write_imagei)
 COL2IM(F32toF32, read_imagef,  float4, convert_float, write_imagef)
 /*
  * COL2IM_2D(name, read_type, dst_type, convert_type, write_type)
  *
  * Generates the kernel col2im_<name>_2D, the variant without a depth axis.
  * Each work-item produces one output element at (x, y, z) = global ids
  * 0..2; z is used unchanged as the third coordinate of both the input and
  * the output:
  *   - the front paddings are added to x and y;
  *   - the loops over j and i start at (coordinate % stride) and advance by
  *     the stride, so (coordinate - index) is always a multiple of the
  *     stride and the "+ stride - 1" rounding in the divisions is exact;
  *   - an index is skipped when the coordinate minus the index is negative,
  *     when coordinate + (kernel_*_new - index) exceeds the *_pad uniform,
  *     or when the index is not a multiple of the dilation;
  *   - coord_in.x combines the two per-axis quotients using move_time_x,
  *     coord_in.y combines the two kernel indices using kernel_x;
  *   - the float sum is mapped with sum * inOutScale + inOutTile, converted
  *     with convert_type and written to the .x lane of the output texel.
  *
  * The parameter list matches col2im_<name>; stride_d, dilation_d, the
  * pad_d_* and pad_*_end values, kernel_y and kernel_z are not read here.
  */
 #define COL2IM_2D(name, read_type, dst_type ,convert_type, write_type) \
 __kernel void col2im_##name##_2D \
 ( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
                 int              stride_w, \
                 int              stride_h, \
                 int              stride_d, \
                 int              dilation_w, \
                 int              dilation_h, \
                 int              dilation_d, \
                 int              pad_w_front, \
                 int              pad_w_end, \
                 int              pad_h_front, \
                 int              pad_h_end, \
                 int              pad_d_front, \
                 int              pad_d_end, \
                 int              kernel_x, \
                 int              kernel_y, \
                 int              kernel_z, \
                 float            inOutScale, \
                 float            inOutTile \
 ) \
 { \
    int x = get_global_id(0); \
    int y = get_global_id(1); \
    int z = get_global_id(2); \
    int4 coord_out = (int4)(x,y,z,0); \
    int4 coord_in = (int4)(0,0,z,0); \
 \
    float sum = 0.0f; \
    x = x + pad_w_front; \
    y = y + pad_h_front; \
    int offset_x = x % stride_w; \
    int offset_y = y % stride_h; \
    int i,j; \
    for (j = offset_y; j < kernel_y_new; j = j + stride_h) \
    { \
        if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \
        { \
            continue; \
        } \
        for (i = offset_x; i < kernel_x_new; i = i + stride_w) \
        { \
            if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \
            { \
                continue; \
            } \
            coord_in.x = (x - i + stride_w - 1) / stride_w + \
                         (y - j + stride_h - 1) / stride_h * move_time_x; \
            coord_in.y = i / dilation_w + j * kernel_x / dilation_h; \
            sum = sum + convert_float(read_type(input, coord_in).x); \
        } \
    } \
    sum = sum * inOutScale + inOutTile; \
    dst_type dst = 0; \
    dst.x = convert_type(sum); \
    write_type(output, coord_out, dst); \
 }
 /* One kernel per (input, output) pairing of the U32 / I32 / F32 image formats. */
 COL2IM_2D(U32toU32, read_imageui, uint4,  convert_uint,  write_imageui)
 COL2IM_2D(U32toI32, read_imageui, int4,   convert_int,   write_imagei)
 COL2IM_2D(U32toF32, read_imageui, float4, convert_float, write_imagef)
 COL2IM_2D(I32toU32, read_imagei,  uint4,  convert_uint,  write_imageui)
 COL2IM_2D(I32toI32, read_imagei,  int4,   convert_int,   write_imagei)
 COL2IM_2D(I32toF32, read_imagei,  float4, convert_float, write_imagef)
 COL2IM_2D(F32toU32, read_imagef,  uint4,  convert_uint,  write_imageui)
 COL2IM_2D(F32toI32, read_imagef,  int4,   convert_int,   write_imagei)
 COL2IM_2D(F32toF32, read_imagef,  float4, convert_float, write_imagef)
--- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl
@ -0,0 +1,332 @@
 /*
  * Cumulative sum along x (axis 0) of a 2D float tensor.
  *
  * Elements are addressed through raw pointers returned by get_image_ptr_from_coord()
  * rather than through the image read/write built-ins. Each work-item walks the whole
  * row y = get_global_id(1).
  *
  *   exclusive - when non-zero the first visited output is 0 and every other output
  *               holds the sum of the elements visited before it
  *   rev       - when non-zero the row is walked from x = width - 1 down to 0
  *   width     - number of elements in the row
  *
  * axis, height, chn, input_zp, in_out_scale, in_out_zp_scale and output_zp are not
  * used by this variant.
  */
 __kernel void cumsum_array_F32toF32_axis0_2D(
    __read_only image2d_t  input,
    __write_only image2d_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    /* .xy addresses the input element, .zw the output element. */
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    Image img1 = create_image_from_image2d(input, 4);
    Image img2 = create_image_from_image2d(output, 4);
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
    float sum = (float)(0);

    /* Walk direction and first column visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (width - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one column ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (width - 1) : width;

    if (exclusive)
    {
        /* Seed the first visited output with the empty sum. */
        coord.z = first;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global float*)output_ptr)[0] = sum;
    }

    for (coord.x = first; remaining > 0; remaining--, coord.x += step)
    {
        input_ptr = get_image_ptr_from_coord(img1, coord.xy);
        sum += ((__global float*)input_ptr)[0];

        coord.z = coord.x + shift;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global float*)output_ptr)[0] = sum;
    }
 }
 /*
  * Cumulative sum along x (axis 0) of a 2D quantized tensor with requantized output.
  *
  * Elements are addressed through raw pointers returned by get_image_ptr_from_coord()
  * and are loaded/stored as 32-bit uint values. Each work-item walks the whole row
  * y = get_global_id(1).
  *
  * After n elements have been accumulated into sum, the stored value is
  *     (uint)convert_int_rte(sum * in_out_scale + n * in_out_zp_scale + output_zp)
  *
  *   exclusive - when non-zero the first visited output is the rounded, saturated
  *               output_zp and every other output covers only the elements before it
  *   rev       - when non-zero the row is walked from x = width - 1 down to 0
  *   width     - number of elements in the row
  *
  * axis, height, chn and input_zp are not used by this variant.
  */
 __kernel void cumsum_array_U8toU8_axis0_2D(
    __read_only image2d_t  input,
    __write_only image2d_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    /* .xy addresses the input element, .zw the output element. */
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    Image img1 = create_image_from_image2d(input, 4);
    Image img2 = create_image_from_image2d(output, 4);
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord);

    uint sum = 0;
    /* Quantized representation of an empty sum. */
    uint dst = convert_uint_sat(convert_int_rte(output_zp));
    float cnt = 0.0f;

    /* Walk direction and first column visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (width - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one column ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (width - 1) : width;

    if (exclusive)
    {
        coord.z = first;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }

    for (coord.x = first; remaining > 0; remaining--, coord.x += step)
    {
        input_ptr = get_image_ptr_from_coord(img1, coord.xy);
        sum += ((__global uint*)input_ptr)[0];
        cnt += 1.0f;

        float tmpAlpha = cnt * in_out_zp_scale + output_zp;
        float tmpSum = sum * in_out_scale + tmpAlpha;
        dst = (uint)convert_int_rte(tmpSum);

        /* Always store the requantized value, never the raw accumulator. */
        coord.z = coord.x + shift;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }
 }
 /*
  * Cumulative sum along x (axis 0) of a 2D float tensor with quantized output.
  *
  * Elements are addressed through raw pointers returned by get_image_ptr_from_coord();
  * inputs are loaded as float and outputs stored as 32-bit uint values. Each work-item
  * walks the whole row y = get_global_id(1).
  *
  * After n elements have been accumulated into sum, the stored value is
  *     (uint)convert_int_rte(sum * in_out_scale + n * in_out_zp_scale + output_zp)
  *
  *   exclusive - when non-zero the first visited output is the rounded, saturated
  *               output_zp and every other output covers only the elements before it
  *   rev       - when non-zero the row is walked from x = width - 1 down to 0
  *   width     - number of elements in the row
  *
  * axis, height, chn and input_zp are not used by this variant.
  */
 __kernel void cumsum_array_F32toU8_axis0_2D(
    __read_only image2d_t  input,
    __write_only image2d_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    /* .xy addresses the input element, .zw the output element. */
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    Image img1 = create_image_from_image2d(input, 4);
    Image img2 = create_image_from_image2d(output, 4);
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord);

    /* Scalars: exactly one element is read and one written per step. */
    float sum = 0.0f;
    uint dst = convert_uint_sat(convert_int_rte(output_zp));
    float cnt = 0.0f;

    /* Walk direction and first column visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (width - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one column ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (width - 1) : width;

    if (exclusive)
    {
        coord.z = first;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }

    for (coord.x = first; remaining > 0; remaining--, coord.x += step)
    {
        input_ptr = get_image_ptr_from_coord(img1, coord.xy);
        sum += ((__global float*)input_ptr)[0];
        cnt += 1.0f;

        float tmpAlpha = cnt * in_out_zp_scale + output_zp;
        float tmpSum = sum * in_out_scale + tmpAlpha;
        dst = (uint)convert_int_rte(tmpSum);

        coord.z = coord.x + shift;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }
 }
--- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl
@ -0,0 +1,321 @@
 /*
  * Cumulative sum along y (axis 1) of a 2D float tensor.
  *
  * Elements are addressed through raw pointers returned by get_image_ptr_from_coord();
  * every store, including the exclusive-mode seed, goes through that pointer path.
  * Each work-item walks the whole column x = get_global_id(0).
  *
  *   exclusive - when non-zero the first visited output is 0 and every other output
  *               holds the sum of the elements visited before it
  *   rev       - when non-zero the column is walked from y = height - 1 down to 0
  *   height    - number of elements in the column
  *
  * axis, width, chn, input_zp, in_out_scale, in_out_zp_scale and output_zp are not
  * used by this variant.
  */
 __kernel void cumsum_array_F32toF32_axis1_2D(
    __read_only image2d_t  input,
    __write_only image2d_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    /* .xy addresses the input element, .zw the output element. */
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    Image img1 = create_image_from_image2d(input, 4);
    Image img2 = create_image_from_image2d(output, 4);
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
    float sum = (float)(0);

    /* Walk direction and first row visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (height - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one row ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (height - 1) : height;

    if (exclusive)
    {
        /* Output row is set explicitly so it never depends on get_global_id(1). */
        coord.w = first;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global float*)output_ptr)[0] = sum;
    }

    for (coord.y = first; remaining > 0; remaining--, coord.y += step)
    {
        input_ptr = get_image_ptr_from_coord(img1, coord.xy);
        sum += ((__global float*)input_ptr)[0];

        coord.w = coord.y + shift;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global float*)output_ptr)[0] = sum;
    }
 }
 /*
  * Cumulative sum along y (axis 1) of a 2D quantized tensor with requantized output.
  *
  * Elements are addressed through raw pointers returned by get_image_ptr_from_coord()
  * and are loaded/stored as 32-bit uint values. Each work-item walks the whole column
  * x = get_global_id(0).
  *
  * After n elements have been accumulated into sum, the stored value is
  *     (uint)convert_int_rte(sum * in_out_scale + n * in_out_zp_scale + output_zp)
  *
  *   exclusive - when non-zero the first visited output is the rounded, saturated
  *               output_zp and every other output covers only the elements before it
  *   rev       - when non-zero the column is walked from y = height - 1 down to 0
  *   height    - number of elements in the column
  *
  * axis, width, chn and input_zp are not used by this variant.
  */
 __kernel void cumsum_array_U8toU8_axis1_2D(
    __read_only image2d_t  input,
    __write_only image2d_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    /* .xy addresses the input element, .zw the output element. */
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    Image img1 = create_image_from_image2d(input, 4);
    Image img2 = create_image_from_image2d(output, 4);
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord);

    uint sum = 0;
    /* Quantized representation of an empty sum. */
    uint dst = convert_uint_sat(convert_int_rte(output_zp));
    float cnt = 0;

    /* Walk direction and first row visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (height - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one row ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (height - 1) : height;

    if (exclusive)
    {
        /* Output row is set explicitly so it never depends on get_global_id(1). */
        coord.w = first;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }

    for (coord.y = first; remaining > 0; remaining--, coord.y += step)
    {
        input_ptr = get_image_ptr_from_coord(img1, coord.xy);
        sum += ((__global uint*)input_ptr)[0];
        cnt += 1.0f;

        float tmpAlpha = cnt * in_out_zp_scale + output_zp;
        float tmpSum = sum * in_out_scale + tmpAlpha;
        dst = (uint)convert_int_rte(tmpSum);

        coord.w = coord.y + shift;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }
 }
 /*
  * Cumulative sum along y (axis 1) of a 2D float tensor with quantized output.
  *
  * Elements are addressed through raw pointers returned by get_image_ptr_from_coord();
  * inputs are loaded as float and outputs stored as 32-bit uint values. Each work-item
  * walks the whole column x = get_global_id(0).
  *
  * After n elements have been accumulated into sum, the stored value is
  *     (uint)convert_int_rte(sum * in_out_scale + n * in_out_zp_scale + output_zp)
  *
  *   exclusive - when non-zero the first visited output is the rounded, saturated
  *               output_zp and every other output covers only the elements before it
  *   rev       - when non-zero the column is walked from y = height - 1 down to 0
  *   height    - number of elements in the column
  *
  * axis, width, chn and input_zp are not used by this variant.
  */
 __kernel void cumsum_array_F32toU8_axis1_2D(
    __read_only image2d_t  input,
    __write_only image2d_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    /* .xy addresses the input element, .zw the output element. */
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    Image img1 = create_image_from_image2d(input, 4);
    Image img2 = create_image_from_image2d(output, 4);
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord);

    float sum = (float)(0);
    /* Quantized representation of an empty sum. */
    uint dst = convert_uint_sat(convert_int_rte(output_zp));
    float cnt = 0;

    /* Walk direction and first row visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (height - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one row ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (height - 1) : height;

    if (exclusive)
    {
        /* Output row is set explicitly so it never depends on get_global_id(1). */
        coord.w = first;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }

    for (coord.y = first; remaining > 0; remaining--, coord.y += step)
    {
        input_ptr = get_image_ptr_from_coord(img1, coord.xy);
        sum += ((__global float*)input_ptr)[0];
        cnt += 1.0f;

        float tmpAlpha = cnt * in_out_zp_scale + output_zp;
        float tmpSum = sum * in_out_scale + tmpAlpha;
        dst = (uint)convert_int_rte(tmpSum);

        coord.w = coord.y + shift;
        output_ptr = get_image_ptr_from_coord(img2, coord.zw);
        ((__global uint*)output_ptr)[0] = dst;
    }
 }
--- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl
@ -0,0 +1,215 @@
 /*
  * Cumulative sum along x (axis 0) of a float tensor held in an image2d_array.
  *
  * Elements are addressed through raw pointers returned by get_tensor_ptr_from_coord().
  * Each work-item walks the whole x line at (y, z) = (get_global_id(1), get_global_id(2)).
  *
  *   exclusive - when non-zero the first visited output is 0 and every other output
  *               holds the sum of the elements visited before it
  *   rev       - when non-zero the line is walked from x = width - 1 down to 0
  *   width     - number of elements in the line
  *
  * axis, height, channel, input_zp, in_out_scale, in_out_zp_scale and output_zp are
  * not used by this variant.
  */
 __kernel void cumsum_array_F32toF32_axis0(
    __read_only image2d_array_t  input,
    __write_only image2d_array_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int channel,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;
    Tensor img1 = create_tensor_from_image2d_array(input, 4);
    Tensor img2 = create_tensor_from_image2d_array(output, 4);
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
    float sum = (float)(0);

    /* Walk direction and first column visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (width - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one column ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (width - 1) : width;

    if (exclusive)
    {
        /* Seed the first visited output with the empty sum. */
        coord_out.x = first;
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
        ((__global float*)output_ptr)[0] = sum;
    }

    for (coord.x = first; remaining > 0; remaining--, coord.x += step)
    {
        input_ptr = get_tensor_ptr_from_coord(img1, coord);
        sum += ((__global float*)input_ptr)[0];

        coord_out.x = coord.x + shift;
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
        ((__global float*)output_ptr)[0] = sum;
    }
 }
 /*
  * Generates cumsum_array_<name>toU8_axis0: cumulative sum along x (axis 0) of a tensor
  * held in an image2d_array, with quantized output.
  *
  * Elements are addressed through raw pointers returned by get_tensor_ptr_from_coord();
  * inputs are loaded as src_type and outputs stored as 32-bit uint values. Each
  * work-item walks the whole x line at (y, z) = (get_global_id(1), get_global_id(2)).
  *
  * After n elements have been accumulated into sum, the stored value is
  *     (uint)convert_int_rte(sum * in_out_scale + n * in_out_zp_scale + output_zp)
  *
  *   exclusive - when non-zero the first visited output is the rounded, saturated
  *               output_zp and every other output covers only the elements before it
  *   rev       - when non-zero the line is walked from x = width - 1 down to 0
  *
  * axis, height, channel and input_zp are not used by the generated kernels.
  */
 #define CUMSUM_ARRAY_toU8_AXIS0_SH(name, src_type) \
 __kernel void cumsum_array_##name##toU8_axis0( \
    __read_only image2d_array_t  input, \
    __write_only image2d_array_t  output, \
    int axis, \
    int exclusive, \
    int rev, \
    int width, \
    int height, \
    int channel, \
    int input_zp, \
    float in_out_scale, \
    float in_out_zp_scale, \
    float output_zp \
    ) \
 { \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
    int4 coord_out = coord; \
    Tensor img1 = create_tensor_from_image2d_array(input, 4); \
    Tensor img2 = create_tensor_from_image2d_array(output, 4); \
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
 \
    src_type sum = (src_type)(0); \
    /* Quantized representation of an empty sum. */ \
    uint dst = convert_uint_sat(convert_int_rte(output_zp)); \
    float cnt = 0; \
 \
    /* Walk direction, first column, exclusive-mode output offset, element count. */ \
    const int step = rev ? -1 : 1; \
    const int first = rev ? (width - 1) : 0; \
    const int shift = exclusive ? step : 0; \
    int remaining = exclusive ? (width - 1) : width; \
 \
    if (exclusive) \
    { \
        coord_out.x = first; \
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
        ((__global uint*)output_ptr)[0] = dst; \
    } \
 \
    for (coord.x = first; remaining > 0; remaining--, coord.x += step) \
    { \
        input_ptr = get_tensor_ptr_from_coord(img1, coord); \
        sum += ((__global src_type*)input_ptr)[0]; \
        cnt += 1.0f; \
 \
        float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
        float tmpSum = sum * in_out_scale + tmpAlpha; \
        dst = (uint)convert_int_rte(tmpSum); \
 \
        coord_out.x = coord.x + shift; \
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
        ((__global uint*)output_ptr)[0] = dst; \
    } \
 }
 CUMSUM_ARRAY_toU8_AXIS0_SH(U8,uint)
 CUMSUM_ARRAY_toU8_AXIS0_SH(F32,float)
--- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl
@ -0,0 +1,216 @@
 /*
  * Cumulative sum along y (axis 1) of a float tensor held in an image2d_array.
  *
  * Elements are addressed through raw pointers returned by get_tensor_ptr_from_coord().
  * Each work-item walks the whole y line at (x, z) = (get_global_id(0), get_global_id(2)).
  *
  *   exclusive - when non-zero the first visited output is 0 and every other output
  *               holds the sum of the elements visited before it
  *   rev       - when non-zero the line is walked from y = height - 1 down to 0
  *   height    - number of elements in the line
  *
  * axis, width, channel, input_zp, in_out_scale, in_out_zp_scale and output_zp are
  * not used by this variant.
  */
 __kernel void cumsum_array_F32toF32_axis1(
    __read_only image2d_array_t  input,
    __write_only image2d_array_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int channel,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;
    Tensor img1 = create_tensor_from_image2d_array(input, 4);
    Tensor img2 = create_tensor_from_image2d_array(output, 4);
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
    float sum = (float)(0);

    /* Walk direction and first row visited. */
    const int step = rev ? -1 : 1;
    const int first = rev ? (height - 1) : 0;
    /* Exclusive mode consumes one element fewer and stores one row ahead. */
    const int shift = exclusive ? step : 0;
    int remaining = exclusive ? (height - 1) : height;

    if (exclusive)
    {
        /* Seed the first visited output with the empty sum. */
        coord_out.y = first;
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
        ((__global float*)output_ptr)[0] = sum;
    }

    for (coord.y = first; remaining > 0; remaining--, coord.y += step)
    {
        input_ptr = get_tensor_ptr_from_coord(img1, coord);
        sum += ((__global float*)input_ptr)[0];

        coord_out.y = coord.y + shift;
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
        ((__global float*)output_ptr)[0] = sum;
    }
 }
 /*
  * Generates cumsum_array_<name>toU8_axis1: cumulative sum along y (axis 1) of a tensor
  * held in an image2d_array, with quantized output.
  *
  * Elements are addressed through raw pointers returned by get_tensor_ptr_from_coord();
  * inputs are loaded as src_type and outputs stored as 32-bit uint values. Each
  * work-item walks the whole y line at (x, z) = (get_global_id(0), get_global_id(2)).
  *
  * After n elements have been accumulated into sum, the stored value is
  *     (uint)convert_int_rte(sum * in_out_scale + n * in_out_zp_scale + output_zp)
  *
  *   exclusive - when non-zero the first visited output is the rounded, saturated
  *               output_zp and every other output covers only the elements before it
  *   rev       - when non-zero the line is walked from y = height - 1 down to 0
  *
  * axis, width, channel and input_zp are not used by the generated kernels.
  */
 #define CUMSUM_ARRAY_toU8_AXIS1_SH(name, src_type) \
 __kernel void cumsum_array_##name##toU8_axis1( \
    __read_only image2d_array_t  input, \
    __write_only image2d_array_t  output, \
    int axis, \
    int exclusive, \
    int rev, \
    int width, \
    int height, \
    int channel, \
    int input_zp, \
    float in_out_scale, \
    float in_out_zp_scale, \
    float output_zp \
    ) \
 { \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
    int4 coord_out = coord; \
    Tensor img1 = create_tensor_from_image2d_array(input, 4); \
    Tensor img2 = create_tensor_from_image2d_array(output, 4); \
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
 \
    src_type sum = (src_type)(0); \
    /* Scalar quantized representation of an empty sum. */ \
    uint dst = convert_uint_sat(convert_int_rte(output_zp)); \
    float cnt = 0; \
 \
    /* Walk direction, first row, exclusive-mode output offset, element count. */ \
    const int step = rev ? -1 : 1; \
    const int first = rev ? (height - 1) : 0; \
    const int shift = exclusive ? step : 0; \
    int remaining = exclusive ? (height - 1) : height; \
 \
    if (exclusive) \
    { \
        coord_out.y = first; \
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
        ((__global uint*)output_ptr)[0] = dst; \
    } \
 \
    for (coord.y = first; remaining > 0; remaining--, coord.y += step) \
    { \
        input_ptr = get_tensor_ptr_from_coord(img1, coord); \
        sum += ((__global src_type*)input_ptr)[0]; \
        cnt += 1.0f; \
 \
        float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
        float tmpSum = sum * in_out_scale + tmpAlpha; \
        dst = (uint)convert_int_rte(tmpSum); \
 \
        coord_out.y = coord.y + shift; \
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
        ((__global uint*)output_ptr)[0] = dst; \
    } \
 }
 CUMSUM_ARRAY_toU8_AXIS1_SH(U8,uint)
 CUMSUM_ARRAY_toU8_AXIS1_SH(F32,float)
--- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl
@ -0,0 +1,215 @@
 /*
  * Cumulative sum of F32 data along z (axis 2), stored as F32.
  *
  * x and y are taken from get_global_id(0) / get_global_id(1); z is walked by the
  * loops below, so a single work-item writes `channel` outputs.
  *   rev       - non-zero: visit z from channel - 1 down to 0 instead of 0 upwards.
  *   exclusive - non-zero: the first slice visited receives 0 and each later slice
  *               receives the sum of the inputs visited before it.
  * axis, width, height, input_zp, in_out_scale, in_out_zp_scale and output_zp are
  * accepted but not read by this kernel.
  */
 __kernel void cumsum_array_F32toF32_axis2(
    __read_only image2d_array_t  input,
    __write_only image2d_array_t  output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int channel,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
 {
    int4 src_pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 dst_pos = src_pos;
    Tensor src_tensor = create_tensor_from_image2d_array(input, 4);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 4);
    /* Direction of travel along z and the slice that is visited first. */
    const int z_step = rev ? -1 : 1;
    const int z_first = rev ? channel - 1 : 0;
    float running = 0;
    int visited;

    src_pos.z = z_first;
    dst_pos.z = z_first;
    if (exclusive)
    {
        /* The first slice only gets the empty sum; outputs then trail inputs by one. */
        uchar* head_raw = get_tensor_ptr_from_coord(dst_tensor, dst_pos);
        __global float* head = (__global float*)head_raw;
        head[0] = running;
        for (visited = 1; visited < channel; visited++)
        {
            uchar* src_raw = get_tensor_ptr_from_coord(src_tensor, src_pos);
            __global float* src = (__global float*)src_raw;
            running += src[0];
            dst_pos.z += z_step;
            uchar* dst_raw = get_tensor_ptr_from_coord(dst_tensor, dst_pos);
            __global float* dst = (__global float*)dst_raw;
            dst[0] = running;
            src_pos.z += z_step;
        }
    }
    else
    {
        /* Inclusive: every slice is read, accumulated and written at the same z. */
        for (visited = 0; visited < channel; visited++)
        {
            uchar* src_raw = get_tensor_ptr_from_coord(src_tensor, src_pos);
            __global float* src = (__global float*)src_raw;
            running += src[0];
            uchar* dst_raw = get_tensor_ptr_from_coord(dst_tensor, src_pos);
            __global float* dst = (__global float*)dst_raw;
            dst[0] = running;
            src_pos.z += z_step;
        }
    }
 }
 /*
  * Generates cumsum_array_<name>toU8_axis2: cumulative sum along z (axis 2) of
  * `src_type` elements, requantized and stored through a uint pointer.
  *
  * After k inputs have been accumulated the stored value is
  *     convert_int_rte(sum * in_out_scale + (k * in_out_zp_scale + output_zp))
  * cast to uint. With `exclusive` set, the first slice visited receives
  * convert_uint_sat(convert_int_rte(output_zp)) and later slices lag the input by one.
  * With `rev` set, z is visited from channel - 1 down to 0.
  * axis, width, height and input_zp are accepted but not read.
  * NOTE(review): only the first exclusive store saturates; the per-slice stores use a
  * plain (uint) cast - confirm that is intended.
  */
 #define CUMSUM_ARRAY_toU8_AXIS2_SH(name, src_type) \
 __kernel void cumsum_array_##name##toU8_axis2( \
    __read_only image2d_array_t  input, \
    __write_only image2d_array_t  output, \
    int axis, \
    int exclusive, \
    int rev, \
    int width, \
    int height, \
    int channel, \
    int input_zp, \
    float in_out_scale, \
    float in_out_zp_scale, \
    float output_zp \
    ) \
 { \
    int4 src_pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
    int4 dst_pos = src_pos; \
    Tensor src_tensor = create_tensor_from_image2d_array(input, 4); \
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 4); \
    const int z_step = rev ? -1 : 1; \
    const int z_first = rev ? channel - 1 : 0; \
    src_type running = (src_type)(0); \
    float taken = 0.0f; \
    int visited; \
 \
    src_pos.z = z_first; \
    dst_pos.z = z_first; \
    if (exclusive) \
    { \
        uchar* head_raw = get_tensor_ptr_from_coord(dst_tensor, dst_pos); \
        __global uint* head = (__global uint*)head_raw; \
        head[0] = convert_uint_sat(convert_int_rte(output_zp)); \
        for (visited = 1; visited < channel; visited++) \
        { \
            uchar* src_raw = get_tensor_ptr_from_coord(src_tensor, src_pos); \
            __global src_type* src = (__global src_type*)src_raw; \
            running += src[0]; \
            taken += 1.0f; \
            dst_pos.z += z_step; \
 \
            float zp_term = taken * in_out_zp_scale + output_zp; \
            float requant = running * in_out_scale + zp_term; \
 \
            uchar* dst_raw = get_tensor_ptr_from_coord(dst_tensor, dst_pos); \
            __global uint* dst = (__global uint*)dst_raw; \
            dst[0] = (uint)convert_int_rte(requant); \
            src_pos.z += z_step; \
        } \
    } \
    else \
    { \
        for (visited = 0; visited < channel; visited++) \
        { \
            uchar* src_raw = get_tensor_ptr_from_coord(src_tensor, src_pos); \
            __global src_type* src = (__global src_type*)src_raw; \
            running += src[0]; \
            taken += 1.0f; \
 \
            float zp_term = taken * in_out_zp_scale + output_zp; \
            float requant = running * in_out_scale + zp_term; \
 \
            uchar* dst_raw = get_tensor_ptr_from_coord(dst_tensor, src_pos); \
            __global uint* dst = (__global uint*)dst_raw; \
            dst[0] = (uint)convert_int_rte(requant); \
            src_pos.z += z_step; \
        } \
    } \
 }
 CUMSUM_ARRAY_toU8_AXIS2_SH(U8,uint)
 CUMSUM_ARRAY_toU8_AXIS2_SH(F32,float)
--- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
@ -18,8 +18,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
 \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
-    __local float local_data[128]; \
+    __local float local_data[LOCAL_SIZE0 * 2]; \
-    __local uint local_indices[128]; \
+    __local uint local_indices[LOCAL_SIZE0 * 2]; \
 \
    float left = read_imagef(input, coord.xy).x; \
    coord.z += work_group_size; \
@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \
            float right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem) ^ signo) \
+            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -78,13 +78,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
    write_imagei(indices, coord.xy, index.xxxx); \
    write_imagei(indices, coord.zy, index.yyyy); \
 }
-TOPK_F32(1 << 0, 0)
+TOPK_F32((1 << 0), 0)
-TOPK_F32(1 << 1, 1)
+TOPK_F32((1 << 1), 1)
-TOPK_F32(1 << 2, 2)
+TOPK_F32((1 << 2), 2)
-TOPK_F32(1 << 3, 3)
+TOPK_F32((1 << 3), 3)
-TOPK_F32(1 << 4, 4)
+TOPK_F32((1 << 4), 4)
-TOPK_F32(1 << 5, 5)
+TOPK_F32((1 << 5), 5)
-TOPK_F32(1 << 6, 6)
+TOPK_F32((1 << 6), 6)
 TOPK_F32((1 << 9), 9)
 #define TOPK_U32(LOCAL_SIZE0, STAGES) \
 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \
@ -106,8 +107,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
 \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
-    __local uint local_data[128]; \
+    __local uint local_data[LOCAL_SIZE0 * 2]; \
-    __local uint local_indices[128]; \
+    __local uint local_indices[LOCAL_SIZE0 * 2]; \
 \
    uint left = read_imageui(input, coord.xy).x; \
    coord.z += work_group_size; \
@ -139,7 +140,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            uint left_elem = local_data[left_id]; \
            uint right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem) ^ signo) \
+            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -166,13 +167,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
    write_imagei(indices, coord.xy, index.xxxx); \
    write_imagei(indices, coord.zy, index.yyyy); \
 }
-TOPK_U32(1 << 0, 0)
+TOPK_U32((1 << 0), 0)
-TOPK_U32(1 << 1, 1)
+TOPK_U32((1 << 1), 1)
-TOPK_U32(1 << 2, 2)
+TOPK_U32((1 << 2), 2)
-TOPK_U32(1 << 3, 3)
+TOPK_U32((1 << 3), 3)
-TOPK_U32(1 << 4, 4)
+TOPK_U32((1 << 4), 4)
-TOPK_U32(1 << 5, 5)
+TOPK_U32((1 << 5), 5)
-TOPK_U32(1 << 6, 6)
+TOPK_U32((1 << 6), 6)
 TOPK_U32((1 << 9), 9)
 #define TOPK_I32(LOCAL_SIZE0, STAGES) \
 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \
@ -194,8 +196,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
 \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
-    __local int local_data[128]; \
+    __local int local_data[LOCAL_SIZE0 * 2]; \
-    __local int local_indices[128]; \
+    __local int local_indices[LOCAL_SIZE0 * 2]; \
 \
    int left = read_imagei(input, coord.xy).x; \
    coord.z += work_group_size; \
@ -227,7 +229,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            int left_elem = local_data[left_id]; \
            int right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem) ^ signo) \
+            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -254,13 +256,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
    write_imagei(indices, coord.xy, index.xxxx); \
    write_imagei(indices, coord.zy, index.yyyy); \
 }
-TOPK_I32(1 << 0, 0)
+TOPK_I32((1 << 0), 0)
-TOPK_I32(1 << 1, 1)
+TOPK_I32((1 << 1), 1)
-TOPK_I32(1 << 2, 2)
+TOPK_I32((1 << 2), 2)
-TOPK_I32(1 << 3, 3)
+TOPK_I32((1 << 3), 3)
-TOPK_I32(1 << 4, 4)
+TOPK_I32((1 << 4), 4)
-TOPK_I32(1 << 5, 5)
+TOPK_I32((1 << 5), 5)
-TOPK_I32(1 << 6, 6)
+TOPK_I32((1 << 6), 6)
 TOPK_I32((1 << 9), 9)
 #define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \
 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \
@ -282,8 +285,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
 \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
-    __local float local_data[128]; \
+    __local float local_data[LOCAL_SIZE0 * 2]; \
-    __local uint local_indices[128]; \
+    __local uint local_indices[LOCAL_SIZE0 * 2]; \
 \
    float left = read_imagef(input, coord.xy).x; \
    coord.z += work_group_size; \
@ -315,7 +318,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \
            float right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem) ^ signo) \
+            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -342,13 +345,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
    write_imagei(indices, coord.zy, index.yyyy); \
 }
-TOPK_F32toU32(1 << 0, 0)
+TOPK_F32toU32((1 << 0), 0)
-TOPK_F32toU32(1 << 1, 1)
+TOPK_F32toU32((1 << 1), 1)
-TOPK_F32toU32(1 << 2, 2)
+TOPK_F32toU32((1 << 2), 2)
-TOPK_F32toU32(1 << 3, 3)
+TOPK_F32toU32((1 << 3), 3)
-TOPK_F32toU32(1 << 4, 4)
+TOPK_F32toU32((1 << 4), 4)
-TOPK_F32toU32(1 << 5, 5)
+TOPK_F32toU32((1 << 5), 5)
-TOPK_F32toU32(1 << 6, 6)
+TOPK_F32toU32((1 << 6), 6)
 TOPK_F32toU32((1 << 9), 9)
 #define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \
 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toI32_I32 \
@ -370,8 +374,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
 \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
-    __local float local_data[128]; \
+    __local float local_data[LOCAL_SIZE0 * 2]; \
-    __local uint local_indices[128]; \
+    __local uint local_indices[LOCAL_SIZE0 * 2]; \
 \
    float left = read_imagef(input, coord.xy).x; \
    coord.z += work_group_size; \
@ -403,7 +407,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \
            float right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem) ^ signo) \
+            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -430,10 +434,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
    write_imagei(indices, coord.zy, index.yyyy); \
 }
-TOPK_F32toI32(1 << 0, 0)
+TOPK_F32toI32((1 << 0), 0)
-TOPK_F32toI32(1 << 1, 1)
+TOPK_F32toI32((1 << 1), 1)
-TOPK_F32toI32(1 << 2, 2)
+TOPK_F32toI32((1 << 2), 2)
-TOPK_F32toI32(1 << 3, 3)
+TOPK_F32toI32((1 << 3), 3)
-TOPK_F32toI32(1 << 4, 4)
+TOPK_F32toI32((1 << 4), 4)
-TOPK_F32toI32(1 << 5, 5)
+TOPK_F32toI32((1 << 5), 5)
-TOPK_F32toI32(1 << 6, 6)
+TOPK_F32toI32((1 << 6), 6)
 TOPK_F32toI32((1 << 9), 9)
--- a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl
@ -0,0 +1,368 @@
 /*
  * Generates bitonic_step_<dtype>(): runs stages 0..num_stages of a bitonic
  * compare-exchange network over key/index pairs held in __local memory.
  *
  * Each caller handles one pair per pass, chosen from its `lx`, and every pass ends
  * with a local-memory barrier. Bit `stage` of `lx` selects the direction: when it
  * is 0 the pair is exchanged if the left key is smaller, or the keys tie and the
  * left index is smaller; when it is 1 the decision is inverted.
  */
 #define BITONIC_STEP(dtype) \
 void bitonic_step_##dtype(uint num_stages, int lx, \
        __local dtype *local_data, __local int *local_indices) \
 { \
    uint stage; \
    uint pass; \
 \
    for (stage = 0; stage < num_stages + 1; ++stage) \
    { \
        const uint flip = (lx >> stage) & 1; \
 \
        for (pass = 0; pass < stage + 1; ++pass) \
        { \
            const uint shift = (stage - pass); \
            const uint span = 1 << shift; \
            const uint lo = ( (lx >> shift) << (shift + 1)) + (lx & (span - 1)); \
            const uint hi = lo + span; \
 \
            const int lo_idx = local_indices[lo]; \
            const int hi_idx = local_indices[hi]; \
            const dtype lo_key = local_data[lo]; \
            const dtype hi_key = local_data[hi]; \
 \
            /* 1 when the left entry ranks below the right one (index breaks ties). */ \
            uint lo_ranks_below = (lo_key < hi_key); \
            if (lo_key == hi_key) \
            { \
                lo_ranks_below = (lo_idx < hi_idx); \
            } \
 \
            if (lo_ranks_below != flip) \
            { \
                local_data[lo] = hi_key; \
                local_indices[lo] = hi_idx; \
                local_data[hi] = lo_key; \
                local_indices[hi] = lo_idx; \
            } \
 \
            barrier(CLK_LOCAL_MEM_FENCE); \
        } \
    } \
 }
 BITONIC_STEP(int)
 BITONIC_STEP(uint)
 /*
  * Generates bitonic_step_ascend_<dtype>(): same stage/pass walk as a full bitonic
  * network (stages 0..num_stages, one pair per caller per pass, barrier after every
  * pass) but with the comparison mirrored: when bit `stage` of `lx` is 0 the pair is
  * exchanged if the left key is larger, or the keys tie and the left index is larger;
  * when the bit is 1 the decision is inverted.
  */
 #define BITONIC_STEP_ASCEND(dtype) \
 void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
        __local dtype *p_share_k, __local int *p_share_v) \
 { \
    uint stage; \
    uint pass; \
 \
    for (stage = 0; stage < num_stages + 1; ++stage) \
    { \
        const uint flip = (lx >> stage) & 1; \
 \
        for (pass = 0; pass < stage + 1; ++pass) \
        { \
            const uint shift = (stage - pass); \
            const uint span = 1 << shift; \
            const uint lo = ( (lx >> shift) << (shift + 1)) + (lx & (span - 1)); \
            const uint hi = lo + span; \
 \
            const int lo_idx = p_share_v[lo]; \
            const int hi_idx = p_share_v[hi]; \
            const dtype lo_key = p_share_k[lo]; \
            const dtype hi_key = p_share_k[hi]; \
 \
            /* 1 when the left entry ranks above the right one (index breaks ties). */ \
            uint lo_ranks_above = (lo_key > hi_key); \
            if (lo_key == hi_key) \
            { \
                lo_ranks_above = (lo_idx > hi_idx); \
            } \
 \
            if (lo_ranks_above != flip) \
            { \
                p_share_k[lo] = hi_key; \
                p_share_v[lo] = hi_idx; \
                p_share_k[hi] = lo_key; \
                p_share_v[hi] = lo_idx; \
            } \
 \
            barrier(CLK_LOCAL_MEM_FENCE); \
        } \
    } \
 }
 BITONIC_STEP_ASCEND(int)
 BITONIC_STEP_ASCEND(uint)
 /*
  * Generates bitonic_merge_<dtype>(): executes only stage `num_stages` of the
  * compare-exchange network (num_stages + 1 passes, pair distance halving from
  * 1 << num_stages down to 1), with a local-memory barrier after every pass.
  * Bit `num_stages` of `lx` selects the direction: when it is 0 the pair is
  * exchanged if the left key is smaller, or the keys tie and the left index is
  * smaller; when it is 1 the decision is inverted.
  */
 #define BITONIC_MERGE(dtype) \
 void bitonic_merge_##dtype(uint num_stages, int lx, \
        __local dtype *local_data, __local int *local_indices) \
 { \
    const uint stage = num_stages; \
    const uint flip = (lx >> stage) & 1; \
    uint pass; \
 \
    for (pass = 0; pass < stage + 1; ++pass) \
    { \
        const uint shift = (stage - pass); \
        const uint span = 1 << shift; \
        const uint lo = ( (lx >> shift) << (shift + 1)) + (lx & (span - 1)); \
        const uint hi = lo + span; \
 \
        const int lo_idx = local_indices[lo]; \
        const int hi_idx = local_indices[hi]; \
        const dtype lo_key = local_data[lo]; \
        const dtype hi_key = local_data[hi]; \
 \
        /* 1 when the left entry ranks below the right one (index breaks ties). */ \
        uint lo_ranks_below = (lo_key < hi_key); \
        if (lo_key == hi_key) \
        { \
            lo_ranks_below = (lo_idx < hi_idx); \
        } \
 \
        if (lo_ranks_below != flip) \
        { \
            local_data[lo] = hi_key; \
            local_indices[lo] = hi_idx; \
            local_data[hi] = lo_key; \
            local_indices[hi] = lo_idx; \
        } \
 \
        barrier(CLK_LOCAL_MEM_FENCE); \
    } \
 }
 BITONIC_MERGE(int)
 BITONIC_MERGE(uint)
 #define BLOCK_SIZE              (512)
 /*
  * Streaming top-BLOCK_SIZE selection over one image row of I32 values.
  *
  * One work-group of BLOCK_SIZE work-items processes the row in chunks of
  * 2 * BLOCK_SIZE elements:
  *   1. The first chunk is sorted with bitonic_step_int (largest first); the first
  *      BLOCK_SIZE entries of local_data are the running result and min_data is the
  *      smallest of them.
  *   2. Every further full chunk is sorted with bitonic_step_ascend_int inside the
  *      p_share window (local_data + BLOCK_SIZE). A chunk whose largest key is below
  *      min_data is dropped; otherwise its upper half is moved next to the running
  *      result and bitonic_merge_int restores the order.
  *   3. A partial last chunk (width not a multiple of 1024) is handled the same way,
  *      with positions at or beyond `width` filled with init_k / init_v.
  *   4. Work-item lx writes local_data[lx] to `output` and local_indices[lx] to
  *      `indices` at (lx, y).
  *
  * input_scale, input_tail, output_scale, output_tail and _num_stages are accepted
  * but not read; the stage count is fixed at 9 (2^(9+1) = 2 * BLOCK_SIZE elements).
  */
 __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
 (
  __read_only  image2d_t input,
  __write_only image2d_t output,
  __write_only image2d_t indices,
               float     input_scale,
               float     input_tail,
               float     output_scale,
               float     output_tail,
               int       _num_stages,
               int       width
  )
 {
    uint lx = get_local_id(0);
    const int init_k = -2147483647;
    const int init_v = -2147483647;
    const int num_stages = 9;
    const int threads_per_block = BLOCK_SIZE;
    const int index_minus_1 = threads_per_block * 2 - 1;
    uint lx1 = lx + threads_per_block;
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    /* [0, 512) running result, [512, 1536) staging window for the next chunk. */
    __local int local_data[1536];
    __local int local_indices[1536];
    int left = read_imagei(input, coord.xy).x;
    coord.z += threads_per_block;
    int right = read_imagei(input, coord.zy).x;
    local_data[lx] = left;
    local_indices[lx] = coord.x;
    local_data[lx1] = right;
    local_indices[lx1] = coord.z;
    barrier(CLK_LOCAL_MEM_FENCE);
    bitonic_step_int(num_stages, lx, local_data, local_indices);
    int min_data = local_data[threads_per_block - 1];
    /* The staging window lives in __local memory, so the pointers must say so. */
    __local int *p_share_k = local_data + threads_per_block;
    __local int *p_share_v = local_indices + threads_per_block;
    /* width rounded down to a multiple of 1024 (= 2 * BLOCK_SIZE). */
    int limit = (width >> 10) << 10;
    p_share_k[lx] = init_k;
    p_share_v[lx] = init_v;
    p_share_k[lx1] = init_k;
    p_share_v[lx1] = init_v;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
    {
        int2 data;
        coord.z = coord.x + threads_per_block;
        data.x = read_imagei(input, coord.xy).x;
        data.y = read_imagei(input, coord.zy).x;
        p_share_k[lx] = data.x;
        p_share_v[lx] = coord.x;
        p_share_k[lx1] = data.y;
        p_share_v[lx1] = coord.z;
        barrier(CLK_LOCAL_MEM_FENCE);
        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
        /*
         * Snapshot the chunk maximum and synchronize before anyone may overwrite it:
         * the work-item owning slot index_minus_1 rewrites that slot at the top of the
         * next iteration (or in the tail), and without this barrier a slower work-item
         * could read the new value and take a different branch around the barriers.
         */
        int chunk_max = p_share_k[index_minus_1];
        barrier(CLK_LOCAL_MEM_FENCE);
        if (chunk_max < min_data)
        {
            continue;
        }
        p_share_k[lx] = p_share_k[lx1];
        p_share_v[lx] = p_share_v[lx1];
        barrier(CLK_LOCAL_MEM_FENCE);
        bitonic_merge_int(num_stages, lx, local_data, local_indices);
        min_data = local_data[threads_per_block - 1];
        p_share_k[lx] = init_k;
        p_share_v[lx] = init_v;
        p_share_k[lx1] = init_k;
        p_share_v[lx1] = init_v;
    }
    if (width > limit)
    {
        if (coord.x < width)
        {
            int2 data;
            data.x = read_imagei(input, coord.xy).x;
            coord.z = coord.x + threads_per_block;
            data.y = read_imagei(input, coord.zy).x;
            p_share_k[lx] = data.x;
            p_share_v[lx] = coord.x;
            p_share_k[lx1] = coord.z < width ? data.y : init_k;
            p_share_v[lx1] = coord.z < width ? coord.z : init_v;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
        if (p_share_k[index_minus_1] >= min_data)
        {
            p_share_k[lx] = p_share_k[lx1];
            p_share_v[lx] = p_share_v[lx1];
            barrier(CLK_LOCAL_MEM_FENCE);
            bitonic_merge_int(num_stages, lx, local_data, local_indices);
        }
    }
    int4 dst;
    dst.x = local_data[lx];
    coord.x = lx;
    write_imagei(output, coord.xy, dst.xxxx);
    int4 index;
    index.x = local_indices[lx];
    write_imagei(indices, coord.xy, index.xxxx);
 }
 /*
  * Streaming top-BLOCK_SIZE selection over one image row of U32 values.
  *
  * One work-group of BLOCK_SIZE work-items processes the row in chunks of
  * 2 * BLOCK_SIZE elements:
  *   1. The first chunk is sorted with bitonic_step_uint (largest first); the first
  *      BLOCK_SIZE entries of local_data are the running result and min_data is the
  *      smallest of them.
  *   2. Every further full chunk is sorted with bitonic_step_ascend_uint inside the
  *      p_share window (local_data + BLOCK_SIZE). A chunk whose largest key is below
  *      min_data is dropped; otherwise its upper half is moved next to the running
  *      result and bitonic_merge_uint restores the order.
  *   3. A partial last chunk (width not a multiple of 1024) is handled the same way,
  *      with positions at or beyond `width` filled with init_k / init_v.
  *   4. Work-item lx writes local_data[lx] to `output` and local_indices[lx] to
  *      `indices` at (lx, y).
  *
  * input_scale, input_tail, output_scale, output_tail and _num_stages are accepted
  * but not read; the stage count is fixed at 9 (2^(9+1) = 2 * BLOCK_SIZE elements).
  */
 __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
 (
  __read_only  image2d_t input,
  __write_only image2d_t output,
  __write_only image2d_t indices,
               float     input_scale,
               float     input_tail,
               float     output_scale,
               float     output_tail,
               int       _num_stages,
               int       width
  )
 {
    uint lx = get_local_id(0);
    const uint init_k = 0;
    const int init_v = -2147483647;
    const int num_stages = 9;
    const int threads_per_block = BLOCK_SIZE;
    const int index_minus_1 = threads_per_block * 2 - 1;
    uint lx1 = lx + threads_per_block;
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
    /* [0, 512) running result, [512, 1536) staging window for the next chunk. */
    __local uint local_data[1536];
    __local int local_indices[1536];
    uint left = read_imageui(input, coord.xy).x;
    coord.z += threads_per_block;
    uint right = read_imageui(input, coord.zy).x;
    local_data[lx] = left;
    local_indices[lx] = coord.x;
    local_data[lx1] = right;
    local_indices[lx1] = coord.z;
    barrier(CLK_LOCAL_MEM_FENCE);
    bitonic_step_uint(num_stages, lx, local_data, local_indices);
    uint min_data = local_data[threads_per_block - 1];
    /* The staging window lives in __local memory, so the pointers must say so. */
    __local uint *p_share_k = local_data + threads_per_block;
    __local int *p_share_v = local_indices + threads_per_block;
    /* width rounded down to a multiple of 1024 (= 2 * BLOCK_SIZE). */
    int limit = (width >> 10) << 10;
    p_share_k[lx] = init_k;
    p_share_v[lx] = init_v;
    p_share_k[lx1] = init_k;
    p_share_v[lx1] = init_v;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
    {
        uint2 data;
        coord.z = coord.x + threads_per_block;
        data.x = read_imageui(input, coord.xy).x;
        data.y = read_imageui(input, coord.zy).x;
        p_share_k[lx] = data.x;
        p_share_v[lx] = coord.x;
        p_share_k[lx1] = data.y;
        p_share_v[lx1] = coord.z;
        barrier(CLK_LOCAL_MEM_FENCE);
        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
        /*
         * Snapshot the chunk maximum and synchronize before anyone may overwrite it:
         * the work-item owning slot index_minus_1 rewrites that slot at the top of the
         * next iteration (or in the tail), and without this barrier a slower work-item
         * could read the new value and take a different branch around the barriers.
         */
        uint chunk_max = p_share_k[index_minus_1];
        barrier(CLK_LOCAL_MEM_FENCE);
        if (chunk_max < min_data)
        {
            continue;
        }
        p_share_k[lx] = p_share_k[lx1];
        p_share_v[lx] = p_share_v[lx1];
        barrier(CLK_LOCAL_MEM_FENCE);
        bitonic_merge_uint(num_stages, lx, local_data, local_indices);
        min_data = local_data[threads_per_block - 1];
        p_share_k[lx] = init_k;
        p_share_v[lx] = init_v;
        p_share_k[lx1] = init_k;
        p_share_v[lx1] = init_v;
    }
    if (width > limit)
    {
        if (coord.x < width)
        {
            uint2 data;
            data.x = read_imageui(input, coord.xy).x;
            coord.z = coord.x + threads_per_block;
            data.y = read_imageui(input, coord.zy).x;
            p_share_k[lx] = data.x;
            p_share_v[lx] = coord.x;
            p_share_k[lx1] = coord.z < width ? data.y : init_k;
            p_share_v[lx1] = coord.z < width ? coord.z : init_v;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
        if (p_share_k[index_minus_1] >= min_data)
        {
            p_share_k[lx] = p_share_k[lx1];
            p_share_v[lx] = p_share_v[lx1];
            barrier(CLK_LOCAL_MEM_FENCE);
            bitonic_merge_uint(num_stages, lx, local_data, local_indices);
        }
    }
    uint4 dst;
    dst.x = local_data[lx];
    coord.x = lx;
    write_imageui(output, coord.xy, dst.xxxx);
    int4 index;
    index.x = local_indices[lx];
    write_imagei(indices, coord.xy, index.xxxx);
 }
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx
@ -0,0 +1,344 @@
 #include "cl_viv_vx_ext.h"
 /*
  * Uniforms shared by the cumsum_array_* kernels of this file.
  * NOTE(review): the contents of the VXC_512Bits tables and the scalar values
  * are not visible here; presumably they are supplied by the host-side kernel
  * setup - confirm there before relying on any of them.
  */
 /* DP table used by the F16 axis1/axis2 kernels to fold a loaded vector into the running sum. */
 _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
 /* DP tables used by the quantized axis1/axis2 kernels; A..D each update one int4 accumulator
    (the I16 kernels use only A and B). */
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
 /* DP table used by every quantized kernel to pack two int4 results into the output vector. */
 _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
 /* DP tables used by the F16 axis0 kernel: A/B/C build the in-vector sums, the last one adds the carry. */
 _viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
 /* DP tables used by the quantized axis0 kernels (row sums, zero-point step, int32 accumulation). */
 _viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
 _viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
 /* DP table used by the F16 kernels to clear their accumulator. */
 _viv_uniform VXC_512Bits uniSetZeroF16_2x8;
 /* Loop bounds: width for the axis0 loops, height for axis1, channel for axis2. */
 _viv_uniform int width;
 _viv_uniform int height;
 _viv_uniform int channel;
 /* Quantization parameters read by the integer kernels. */
 _viv_uniform int input_zp;
 _viv_uniform float in_out_scale;
 _viv_uniform float in_out_zp_scale;
 _viv_uniform float output_zp;
 /* Tail handling: when x equals w_size rounded down to the vector width and remainder is
    non-zero, the kernels move x back by (vector width - remainder). */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * F16 accumulation along coord.z using raw tensor pointers.
  * The work-item loads one vxc_short8 at its (x, y) for every z in
  * [0, channel), folds it into the accumulator through
  * uniAccSumVertF16toF16_2x8 and stores the accumulator at the same position.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_F16toF16_axis2(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_short8 in_bits, out_bits;
    vxc_half8 in_half, acc;
    /* Clear the accumulator through the zeroing DP table. */
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder).
       NOTE(review): presumably keeps the 8-lane access inside the row - confirm. */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    for (pos.z = 0; pos.z < channel; ++pos.z)
    {
        __global vxc_short8* rd = (__global vxc_short8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_short8* wr = (__global vxc_short8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_bits = *rd;
        /* Copy the 16 raw bytes into the half-typed vector before accumulating. */
        _viv_asm(COPY, in_half, in_bits, 16);
        VXC_DP2x8(acc, in_half, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
        _viv_asm(COPY, out_bits, acc, 16);
        *wr = out_bits;
    }
 }
 /*
  * Generates cumsum_array_<in_name>to<out_name>_axis2 for 8-bit data.
  * Per z step the kernel loads one src_type vector, updates four int4
  * accumulators through the uniAccSumVertU8toI32{A,B,C,D} tables, rescales
  * them with in_out_scale plus a per-step offset, rounds to nearest even and
  * packs the 16 results into one dst_type vector that is stored in place.
  * The axis, exclusive and rev arguments are not read.
  */
 #define CUMSUM_8BITS_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \
 __kernel void cumsum_array_##in_name##to##out_name##_axis2( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
    src_type in_vec; \
    dst_type out_vec; \
    int4 acc0 = (int4)(0); \
    int4 acc1 = (int4)(0); \
    int4 acc2 = (int4)(0); \
    int4 acc3 = (int4)(0); \
    Tensor src_tensor = create_tensor_from_image2d_array(input, 1); \
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 1); \
 \
    /* At the last 16-aligned x, move the window back by (16 - remainder). */ \
    if (remainder != 0 && pos.x == ((w_size >> 4) * 16)) \
    { \
        pos.x -= (16 - remainder); \
    } \
    for (pos.z = 0; pos.z < channel; ++pos.z) \
    { \
        __global src_type* rd = (__global src_type*)get_tensor_ptr_from_coord(src_tensor, pos); \
        __global dst_type* wr = (__global dst_type*)get_tensor_ptr_from_coord(dst_tensor, pos); \
        in_vec = *rd; \
        VXC_DP4x4(acc0, in_vec, acc0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
        VXC_DP4x4(acc1, in_vec, acc1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
        VXC_DP4x4(acc2, in_vec, acc2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
        VXC_DP4x4(acc3, in_vec, acc3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
        /* Offset grows with the number of accumulated steps (z + 1). */ \
        float step_offset = convert_float(pos.z + 1) * in_out_zp_scale + output_zp; \
        float4 scaled0 = convert_float4(acc0) * in_out_scale + step_offset; \
        float4 scaled1 = convert_float4(acc1) * in_out_scale + step_offset; \
        float4 scaled2 = convert_float4(acc2) * in_out_scale + step_offset; \
        float4 scaled3 = convert_float4(acc3) * in_out_scale + step_offset; \
        int4 q0 = convert_int4_rte(scaled0); \
        int4 q1 = convert_int4_rte(scaled1); \
        int4 q2 = convert_int4_rte(scaled2); \
        int4 q3 = convert_int4_rte(scaled3); \
        /* Lanes 0..7 come from q0/q1, lanes 8..15 from q2/q3. */ \
        VXC_DP2x8(out_vec, q0, q1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
        VXC_DP2x8(out_vec, q2, q3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\
        *wr = out_vec; \
    } \
 }
 CUMSUM_8BITS_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)
 CUMSUM_8BITS_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)
 /*
  * I16 quantized accumulation along coord.z using raw tensor pointers.
  * Per z step one vxc_short8 is loaded, two int4 accumulators are updated
  * through uniAccSumVertU8toI32{A,B}, rescaled with in_out_scale plus a
  * per-step offset, rounded to nearest even and packed into 8 output lanes.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_I16toI16_axis2(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_short8 in_vec, out_vec;
    int4 acc0 = (int4)(0);
    int4 acc1 = (int4)(0);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    for (pos.z = 0; pos.z < channel; ++pos.z)
    {
        __global vxc_short8* rd = (__global vxc_short8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_short8* wr = (__global vxc_short8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_vec = *rd;
        VXC_DP4x4(acc0, in_vec, acc0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
        VXC_DP4x4(acc1, in_vec, acc1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
        /* Offset grows with the number of accumulated steps (z + 1). */
        float step_offset = convert_float(pos.z + 1) * in_out_zp_scale + output_zp;
        float4 scaled0 = convert_float4(acc0) * in_out_scale + step_offset;
        float4 scaled1 = convert_float4(acc1) * in_out_scale + step_offset;
        int4 q0 = convert_int4_rte(scaled0);
        int4 q1 = convert_int4_rte(scaled1);
        VXC_DP2x8(out_vec, q0, q1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);
        *wr = out_vec;
    }
 }
 /*
  * F16 accumulation along coord.y using raw tensor pointers.
  * The work-item loads one vxc_short8 at its (x, z) for every y in
  * [0, height), folds it into the accumulator through
  * uniAccSumVertF16toF16_2x8 and stores the accumulator at the same position.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_F16toF16_axis1(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_short8 in_bits, out_bits;
    vxc_half8 in_half, acc;
    /* Clear the accumulator through the zeroing DP table. */
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    /* The y taken from get_global_id(1) is discarded: the walk always starts at row 0. */
    for (pos.y = 0; pos.y < height; ++pos.y)
    {
        __global vxc_short8* rd = (__global vxc_short8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_short8* wr = (__global vxc_short8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_bits = *rd;
        /* Copy the 16 raw bytes into the half-typed vector before accumulating. */
        _viv_asm(COPY, in_half, in_bits, 16);
        VXC_DP2x8(acc, in_half, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
        _viv_asm(COPY, out_bits, acc, 16);
        *wr = out_bits;
    }
 }
 /*
  * Generates cumsum_array_<in_name>to<out_name>_axis1 for 8-bit data.
  * Per y step the kernel loads one src_type vector, updates four int4
  * accumulators through the uniAccSumVertU8toI32{A,B,C,D} tables, rescales
  * them with in_out_scale plus a per-step offset, rounds to nearest even and
  * packs the 16 results into one dst_type vector that is stored in place.
  * The axis, exclusive and rev arguments are not read.
  *
  * The tensors are created with an x stride of 1, the value the other 8-bit
  * kernels (axis2, axis1_2D, axis0_2D) pass for one-byte elements; a stride of
  * 2 would make get_tensor_ptr_from_coord address the wrong bytes for x > 0.
  */
 #define CUMSUM_8BITS_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \
 __kernel void cumsum_array_##in_name##to##out_name##_axis1( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
 \
    src_type src; \
    dst_type dst; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
    Tensor img1 = create_tensor_from_image2d_array(input, 1); \
    Tensor img2 = create_tensor_from_image2d_array(output, 1); \
    /* At the last 16-aligned x, move the window back by (16 - remainder). */ \
    if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
    { \
        coord.x = coord.x - (16 - remainder); \
    } \
 \
    for(coord.y = 0; coord.y < height; coord.y++) \
    { \
        uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
        uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
        __global src_type* in_ptr = (__global src_type*)input_ptr; \
        __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
        src = in_ptr[0]; \
        VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
        VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
        VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
        VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
        /* Offset grows with the number of accumulated rows (y + 1). */ \
        float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \
        float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
        float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
        float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
        float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
        int4 tmpDst0 = convert_int4_rte(tmpSum0); \
        int4 tmpDst1 = convert_int4_rte(tmpSum1); \
        int4 tmpDst2 = convert_int4_rte(tmpSum2); \
        int4 tmpDst3 = convert_int4_rte(tmpSum3); \
        /* Lanes 0..7 come from tmpDst0/1, lanes 8..15 from tmpDst2/3. */ \
        VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
        VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
        out_ptr[0] = dst; \
    } \
 }
 CUMSUM_8BITS_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)
 CUMSUM_8BITS_ARRAY_AXIS1(I8, I8, vxc_char16,  vxc_char16)
 /*
  * I16 quantized accumulation along coord.y using raw tensor pointers.
  * Per y step one vxc_short8 is loaded, two int4 accumulators are updated
  * through uniAccSumVertU8toI32{A,B}, rescaled with in_out_scale plus a
  * per-step offset, rounded to nearest even and packed into 8 output lanes.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_I16toI16_axis1(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_short8 in_vec, out_vec;
    int4 acc0 = (int4)(0);
    int4 acc1 = (int4)(0);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    for (pos.y = 0; pos.y < height; ++pos.y)
    {
        __global vxc_short8* rd = (__global vxc_short8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_short8* wr = (__global vxc_short8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_vec = *rd;
        VXC_DP4x4(acc0, in_vec, acc0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
        VXC_DP4x4(acc1, in_vec, acc1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
        /* Offset grows with the number of accumulated rows (y + 1). */
        float step_offset = convert_float(pos.y + 1) * in_out_zp_scale + output_zp;
        float4 scaled0 = convert_float4(acc0) * in_out_scale + step_offset;
        float4 scaled1 = convert_float4(acc1) * in_out_scale + step_offset;
        int4 q0 = convert_int4_rte(scaled0);
        int4 q1 = convert_int4_rte(scaled1);
        VXC_DP2x8(out_vec, q0, q1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
        *wr = out_vec;
    }
 }
 /*
  * F16 accumulation along coord.x using raw tensor pointers.
  * The work-item walks its row in steps of 8 elements: each vxc_short8 is
  * reduced with the uniSumHorzF16toF16{A,B,C} tables, combined with the
  * carried accumulator through uniAccSumHorzF16toF16_2x8 and stored back.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_F16toF16_axis0(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_short8 in_bits, out_bits;
    vxc_half8 in_half, partial, acc;
    /* Clear the accumulator through the zeroing DP table. */
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    for (; pos.x < width; pos.x += 8)
    {
        /* At the last 8-aligned x, move the window back by (8 - remainder).
           NOTE(review): the moved window overlaps lanes already folded into acc;
           confirm the host-side uniforms account for that. */
        if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
        {
            pos.x -= (8 - remainder);
        }
        __global vxc_short8* rd = (__global vxc_short8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_short8* wr = (__global vxc_short8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_bits = *rd;
        _viv_asm(COPY, in_half, in_bits, 16);
        /* Lanes 0..3 and 4..7 of partial are produced by separate tables, then refined by C. */
        VXC_DP4x4(partial, in_half, in_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
        VXC_DP4x4(partial, in_half, in_half, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
        VXC_DP2x8(partial, partial, partial, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
        VXC_DP2x8(acc, partial, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
        _viv_asm(COPY, out_bits, acc, 16);
        *wr = out_bits;
    }
 }
 /*
  * Generates cumsum_array_<in_name>to<out_name>_axis0 for quantized data.
  * The work-item walks its row in steps of 8 elements: each loaded vector is
  * reduced to 16-bit row sums (uniSumHorzU8toI16{A,B}), passed through the
  * zero-point table (uniSubZpI16toI16_2x8), accumulated into two int4 sums,
  * rescaled with in_out_scale + output_zp, rounded to nearest even and packed
  * into lanes 0..7 of the output vector.
  * The axis, exclusive and rev arguments are not read.
  *
  * stride_data is the element size in bytes handed to
  * create_tensor_from_image2d_array (1 for U8/I8, 2 for I16), mirroring the
  * 2D variant of this macro.  The tensor descriptors img1/img2 must be
  * declared here: the loop body uses them for every pointer computation.
  *
  * NOTE(review): for the 16-lane U8/I8 vector types only lanes 0..7 of dst are
  * assigned before the full vector is stored - confirm the extra lanes are
  * harmless for the caller's buffer layout.
  */
 #define CUMSUM_ARRAY_QINT_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \
 __kernel void cumsum_array_##in_name##to##out_name##_axis0( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
 \
    src_type src; \
    dst_type dst; \
    vxc_short8 rowSum; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0); \
    short zp = (short)input_zp; \
    Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \
    Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \
 \
    for(; coord.x < width; coord.x += 8) \
    { \
        /* At the last 8-aligned x, move the window back by (8 - remainder). */ \
        if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
        { \
            coord.x = coord.x - (8 - remainder); \
        } \
        uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
        uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
        __global src_type* in_ptr = (__global src_type*)input_ptr; \
        __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
        src = in_ptr[0]; \
        VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \
        VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \
        VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \
        /* Both accumulators take the previous sum1 as carry; sum0 must be updated first. */ \
        VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \
        VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \
 \
        float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
        float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
        int4 tmpDst0 = convert_int4_rte(tmpSum0); \
        int4 tmpDst1 = convert_int4_rte(tmpSum1); \
        VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
        out_ptr[0] = dst; \
    } \
 }
 CUMSUM_ARRAY_QINT_AXIS0(U8,  U8,  vxc_uchar16, vxc_uchar16, 1)
 CUMSUM_ARRAY_QINT_AXIS0(I8,  I8,  vxc_char16,  vxc_char16,  1)
 CUMSUM_ARRAY_QINT_AXIS0(I16, I16, vxc_short8,  vxc_short8,  2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx
@ -0,0 +1,259 @@
 #include "cl_viv_vx_ext.h"
 /*
  * Uniforms shared by the cumsum_array_*_2D kernels of this file.
  * NOTE(review): the contents of the VXC_512Bits tables and the scalar values
  * are not visible here; presumably they are supplied by the host-side kernel
  * setup - confirm there before relying on any of them.
  */
 /* DP table used by the F16 axis1 kernel to fold a loaded vector into the running sum. */
 _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
 /* DP tables used by the quantized axis1 kernels; A..D each update one int4 accumulator
    (the I16 kernel uses only A and B). */
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
 /* DP table used by every quantized kernel to pack two int4 results into the output vector. */
 _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
 /* DP tables used by the F16 axis0 kernel: A/B/C build the in-vector sums, the last one adds the carry. */
 _viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
 /* DP tables used by the quantized axis0 kernels (row sums, zero-point step, int32 accumulation). */
 _viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
 _viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
 /* DP table used by the F16 kernels to clear their accumulator. */
 _viv_uniform VXC_512Bits uniSetZeroF16_2x8;
 /* Loop bounds: width for the axis0 loops, height for the axis1 loops. */
 _viv_uniform int width;
 _viv_uniform int height;
 /* Quantization parameters read by the integer kernels. */
 _viv_uniform int input_zp;
 _viv_uniform float in_out_scale;
 _viv_uniform float in_out_zp_scale;
 _viv_uniform float output_zp;
 /* Tail handling: when x equals w_size rounded down to the vector width and remainder is
    non-zero, the kernels move x back by (vector width - remainder). */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * F16 accumulation along coord.y for 2D images using raw image pointers.
  * The work-item loads one vxc_short8 at its x for every y in [0, height),
  * folds it into the accumulator through uniAccSumVertF16toF16_2x8 and stores
  * the accumulator at the same position.
  * The axis, exclusive and rev arguments are not read by this kernel.
  *
  * The input vector is taken only from the pointer load.  An extra
  * VXC_ReadImage into the same variable used to follow that load and
  * overwrote its result; no other kernel in this file reads through the
  * image path, so it has been dropped.
  */
 __kernel void cumsum_array_F16toF16_axis1_2D(
    __read_only image2d_t   input,
    __write_only image2d_t  output,
    int axis, int exclusive, int rev
    )
 {
    int2 coord = (int2)(get_global_id(0), 0);
    vxc_short8 src, dst;
    vxc_half8 data, sum;
    /* Clear the accumulator through the zeroing DP table. */
    VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
    Image img1 = create_image_from_image2d(input, 2);
    Image img2 = create_image_from_image2d(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
    {
        coord.x = coord.x - (8 - remainder);
    }
    for(; coord.y < height; coord.y++)
    {
        uchar* input_ptr = get_image_ptr_from_coord(img1, coord);
        uchar* output_ptr = get_image_ptr_from_coord(img2, coord);
        __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
        __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
        src = in_ptr[0];
        /* Copy the 16 raw bytes into the half-typed vector before accumulating. */
        _viv_asm(COPY, data, src, 16);
        VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
                uniAccSumVertF16toF16_2x8);
        _viv_asm(COPY, dst, sum, 16);
        out_ptr[0] = dst;
    }
 }
 /*
  * Generates cumsum_array_<in_name>to<out_name>_axis1_2D for 8-bit data.
  * Per y step the kernel loads one src_type vector, updates four int4
  * accumulators through the uniAccSumVertU8toI32{A,B,C,D} tables, rescales
  * them with in_out_scale plus a per-step offset, rounds to nearest even and
  * packs the 16 results into one dst_type vector that is stored in place.
  * The axis, exclusive and rev arguments are not read.
  */
 #define CUMSUM_8BITS_ARRAY_AXIS1_2D(in_name, out_name, src_type, dst_type) \
 __kernel void cumsum_array_##in_name##to##out_name##_axis1_2D( \
    __read_only image2d_t   input, \
    __write_only image2d_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int2 pos = (int2)(get_global_id(0), get_global_id(1)); \
    src_type in_vec; \
    dst_type out_vec; \
    int4 acc0 = (int4)(0), acc1 = (int4)(0), acc2 = (int4)(0), acc3 = (int4)(0); \
    Image src_image = create_image_from_image2d(input, 1); \
    Image dst_image = create_image_from_image2d(output, 1); \
 \
    /* At the last 16-aligned x, move the window back by (16 - remainder). */ \
    if (remainder != 0 && pos.x == ((w_size >> 4) * 16)) \
    { \
        pos.x -= (16 - remainder); \
    } \
    for (pos.y = 0; pos.y < height; ++pos.y) \
    { \
        __global src_type* rd = (__global src_type*)get_image_ptr_from_coord(src_image, pos); \
        __global dst_type* wr = (__global dst_type*)get_image_ptr_from_coord(dst_image, pos); \
        in_vec = *rd; \
        VXC_DP4x4(acc0, in_vec, acc0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
        VXC_DP4x4(acc1, in_vec, acc1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
        VXC_DP4x4(acc2, in_vec, acc2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
        VXC_DP4x4(acc3, in_vec, acc3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
 \
        /* Offset grows with the number of accumulated rows (y + 1). */ \
        float step_offset = convert_float(pos.y + 1) * in_out_zp_scale + output_zp; \
        float4 scaled0 = convert_float4(acc0) * in_out_scale + step_offset; \
        float4 scaled1 = convert_float4(acc1) * in_out_scale + step_offset; \
        float4 scaled2 = convert_float4(acc2) * in_out_scale + step_offset; \
        float4 scaled3 = convert_float4(acc3) * in_out_scale + step_offset; \
        int4 q0 = convert_int4_rte(scaled0); \
        int4 q1 = convert_int4_rte(scaled1); \
        int4 q2 = convert_int4_rte(scaled2); \
        int4 q3 = convert_int4_rte(scaled3); \
 \
        /* Lanes 0..7 come from q0/q1, lanes 8..15 from q2/q3. */ \
        VXC_DP2x8(out_vec, q0, q1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
        VXC_DP2x8(out_vec, q2, q3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
        *wr = out_vec; \
    } \
 }
 CUMSUM_8BITS_ARRAY_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16)
 CUMSUM_8BITS_ARRAY_AXIS1_2D(I8, I8, vxc_char16, vxc_char16)
 /*
  * I16 quantized accumulation along coord.y for 2D images using raw pointers.
  * Per y step one vxc_short8 is loaded, two int4 accumulators are updated
  * through uniAccSumVertU8toI32{A,B}, rescaled with in_out_scale plus a
  * per-step offset, rounded to nearest even and packed into 8 output lanes.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_I16toI16_axis1_2D(
    __read_only image2d_t   input,
    __write_only image2d_t  output,
    int axis, int exclusive, int rev
    )
 {
    int2 pos = (int2)(get_global_id(0), get_global_id(1));
    vxc_short8 in_vec, out_vec;
    int4 acc0 = (int4)(0);
    int4 acc1 = (int4)(0);
    Image src_image = create_image_from_image2d(input, 2);
    Image dst_image = create_image_from_image2d(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    for (pos.y = 0; pos.y < height; ++pos.y)
    {
        __global vxc_short8* rd = (__global vxc_short8*)get_image_ptr_from_coord(src_image, pos);
        __global vxc_short8* wr = (__global vxc_short8*)get_image_ptr_from_coord(dst_image, pos);
        in_vec = *rd;
        VXC_DP4x4(acc0, in_vec, acc0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
        VXC_DP4x4(acc1, in_vec, acc1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
        /* Offset grows with the number of accumulated rows (y + 1). */
        float step_offset = convert_float(pos.y + 1) * in_out_zp_scale + output_zp;
        float4 scaled0 = convert_float4(acc0) * in_out_scale + step_offset;
        float4 scaled1 = convert_float4(acc1) * in_out_scale + step_offset;
        int4 q0 = convert_int4_rte(scaled0);
        int4 q1 = convert_int4_rte(scaled1);
        VXC_DP2x8(out_vec, q0, q1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
        *wr = out_vec;
    }
 }
 /*
  * F16 accumulation along coord.x for 2D images using raw image pointers.
  * The work-item walks its row in steps of 8 elements: each vxc_short8 is
  * reduced with the uniSumHorzF16toF16{A,B,C} tables, combined with the
  * carried accumulator through uniAccSumHorzF16toF16_2x8 and stored back.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_F16toF16_axis0_2D(
    __read_only image2d_t   input,
    __write_only image2d_t  output,
    int axis, int exclusive, int rev
    )
 {
    int2 pos = (int2)(get_global_id(0), get_global_id(1));
    vxc_short8 in_bits, out_bits;
    vxc_half8 in_half, partial, acc;
    /* Clear the accumulator through the zeroing DP table. */
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
    Image src_image = create_image_from_image2d(input, 2);
    Image dst_image = create_image_from_image2d(output, 2);
    for (; pos.x < width; pos.x += 8)
    {
        /* At the last 8-aligned x, move the window back by (8 - remainder).
           NOTE(review): the moved window overlaps lanes already folded into acc;
           confirm the host-side uniforms account for that. */
        if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
        {
            pos.x -= (8 - remainder);
        }
        __global vxc_short8* rd = (__global vxc_short8*)get_image_ptr_from_coord(src_image, pos);
        __global vxc_short8* wr = (__global vxc_short8*)get_image_ptr_from_coord(dst_image, pos);
        in_bits = *rd;
        _viv_asm(COPY, in_half, in_bits, 16);
        /* Lanes 0..3 and 4..7 of partial are produced by separate tables, then refined by C. */
        VXC_DP4x4(partial, in_half, in_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
        VXC_DP4x4(partial, in_half, in_half, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
        VXC_DP2x8(partial, partial, partial, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
        VXC_DP2x8(acc, partial, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
        _viv_asm(COPY, out_bits, acc, 16);
        *wr = out_bits;
    }
 }
 /*
  * Generates cumsum_array_<in_name>to<out_name>_axis0_2D for quantized data.
  * The work-item walks its row in steps of 8 elements: each loaded vector is
  * reduced to 16-bit row sums (uniSumHorzU8toI16{A,B}), passed through the
  * zero-point table (uniSubZpI16toI16_2x8), accumulated into two int4 sums,
  * rescaled with in_out_scale + output_zp, rounded to nearest even and packed
  * into lanes 0..7 of the output vector.
  * The axis, exclusive and rev arguments are not read.
  *
  * stride_data is the element size in bytes handed to
  * create_image_from_image2d (1 for U8/I8, 2 for I16).
  *
  * The accumulators are zero-initialized explicitly, as in the sibling
  * kernels, instead of XOR-ing uninitialized vectors with themselves.
  *
  * NOTE(review): for the 16-lane U8/I8 vector types only lanes 0..7 of dst are
  * assigned before the full vector is stored - confirm the extra lanes are
  * harmless for the caller's buffer layout.
  */
 #define CUMSUM_ARRAY_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type, stride_data) \
 __kernel void cumsum_array_##in_name##to##out_name##_axis0_2D( \
    __read_only image2d_t   input, \
    __write_only image2d_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
 \
    src_type src; \
    dst_type dst; \
    vxc_short8 rowSum; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0); \
    short zp = (short)input_zp; \
    Image img1 = create_image_from_image2d(input, stride_data); \
    Image img2 = create_image_from_image2d(output, stride_data); \
 \
    for(; coord.x < width; coord.x += 8) \
    { \
        /* At the last 8-aligned x, move the window back by (8 - remainder). */ \
        if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
        { \
            coord.x = coord.x - (8 - remainder); \
        } \
        uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \
        uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \
        __global src_type* in_ptr = (__global src_type*)input_ptr; \
        __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
        src = in_ptr[0]; \
        VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
                uniSumHorzU8toI16A_4x4); \
        VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
                uniSumHorzU8toI16B_8x4); \
        VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
                uniSubZpI16toI16_2x8); \
        /* Both accumulators take the previous sum1 as carry; sum0 must be updated first. */ \
        VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
                uniAccSumHorzI16toI32A_4x4); \
        VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
                uniAccSumHorzI16toI32B_4x4); \
 \
        float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
        float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
        int4 tmpDst0 = convert_int4_rte(tmpSum0); \
        int4 tmpDst1 = convert_int4_rte(tmpSum1); \
 \
        VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                 uniConvertInt32toUint8_2x8); \
        out_ptr[0] = dst; \
    } \
 }
 CUMSUM_ARRAY_QINT_AXIS0_2D(U8,  U8,  vxc_uchar16, vxc_uchar16, 1)
 CUMSUM_ARRAY_QINT_AXIS0_2D(I8,  I8,  vxc_char16,  vxc_char16, 1)
 CUMSUM_ARRAY_QINT_AXIS0_2D(I16, I16, vxc_short8,  vxc_short8, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx
@ -0,0 +1,244 @@
 #include "cl_viv_vx_ext.h"
 /*
  * Uniforms shared by the BF16 kernels of this file.
  * NOTE(review): the contents of the VXC_512Bits tables and the scalar values
  * are not visible here; presumably they are supplied by the host-side kernel
  * setup - confirm there before relying on any of them.
  */
 /* DP tables applied to the loaded vector together with a zero vector; each result is
    copied bit-for-bit into a float4. */
 _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
 _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
 /* DP table that packs the two float4 results (viewed as ushort lanes) into one output vector. */
 _viv_uniform VXC_512Bits uniExtractOddData_2x8;
 /* Loop bounds: width for the axis0 loops, height for axis1, channel for axis2. */
 _viv_uniform int width;
 _viv_uniform int height;
 _viv_uniform int channel;
 /* Tail handling: when x equals w_size rounded down to 8 and remainder is non-zero,
    the kernels move x back by (8 - remainder). */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * BF16 accumulation along coord.z using raw tensor pointers.
  * Per z step one vxc_ushort8 is loaded, expanded into two float4 halves via
  * the uniConvBF16toF32_Part{0,1} tables, added to two float4 accumulators
  * and packed back into a vxc_ushort8 through uniExtractOddData_2x8.
  * The axis, exclusive and rev arguments are not read by this kernel.
  */
 __kernel void cumsum_array_BF16toBF16_axis2(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_ushort8 in_vec, wide0, wide1;
    vxc_ushort8 acc_bits0, acc_bits1, out_vec;
    vxc_ushort8 zeros = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    float4 acc0 = (float4)(0);
    float4 acc1 = (float4)(0);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    for (pos.z = 0; pos.z < channel; ++pos.z)
    {
        float4 val0, val1;
        __global vxc_ushort8* rd = (__global vxc_ushort8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_ushort8* wr = (__global vxc_ushort8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_vec = *rd;
        VXC_DP2x8(wide0, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
        VXC_DP2x8(wide1, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
        /* View the widened 16 bytes as float4 values. */
        _viv_asm(COPY, val0, wide0, 16);
        _viv_asm(COPY, val1, wide1, 16);
        acc0 += val0;
        acc1 += val1;
        _viv_asm(COPY, acc_bits0, acc0, 16);
        _viv_asm(COPY, acc_bits1, acc1, 16);
        VXC_DP2x8(out_vec, acc_bits0, acc_bits1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
        *wr = out_vec;
    }
 }
 /*
  * BF16 accumulation along coord.y using raw tensor pointers.
  * Per y step one vxc_ushort8 is loaded, expanded into two float4 halves via
  * the uniConvBF16toF32_Part{0,1} tables, added to two float4 accumulators
  * and packed back into a vxc_ushort8 through uniExtractOddData_2x8.
  * The axis, exclusive and rev arguments are not read by this kernel.
  * NOTE(review): unlike cumsum_array_BF16toBF16_axis2 in this file, this
  * kernel name has no "array_" part - confirm the host-side kernel lookup
  * expects this spelling.
  */
 __kernel void cumsum_BF16toBF16_axis1(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_ushort8 in_vec, wide0, wide1;
    vxc_ushort8 acc_bits0, acc_bits1, out_vec;
    vxc_ushort8 zeros = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    float4 acc0 = (float4)(0);
    float4 acc1 = (float4)(0);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    for (pos.y = 0; pos.y < height; ++pos.y)
    {
        float4 val0, val1;
        __global vxc_ushort8* rd = (__global vxc_ushort8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_ushort8* wr = (__global vxc_ushort8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_vec = *rd;
        VXC_DP2x8(wide0, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
        VXC_DP2x8(wide1, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
        /* View the widened 16 bytes as float4 values. */
        _viv_asm(COPY, val0, wide0, 16);
        _viv_asm(COPY, val1, wide1, 16);
        acc0 += val0;
        acc1 += val1;
        _viv_asm(COPY, acc_bits0, acc0, 16);
        _viv_asm(COPY, acc_bits1, acc1, 16);
        VXC_DP2x8(out_vec, acc_bits0, acc_bits1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
        *wr = out_vec;
    }
 }
 /*
  * BF16 prefix sum along coord.x using raw tensor pointers.
  * The work-item walks its row in steps of 8 elements.  Each vxc_ushort8 is
  * expanded into two float4 halves, the in-vector prefix sums are built with
  * scalar adds and dot products, the carry from the previous vectors is
  * added, and the result is packed back through uniExtractOddData_2x8.
  * The axis, exclusive and rev arguments are not read by this kernel.
  * NOTE(review): unlike cumsum_array_BF16toBF16_axis2 in this file, this
  * kernel name has no "array_" part - confirm the host-side kernel lookup
  * expects this spelling.
  */
 __kernel void cumsum_BF16toBF16_axis0(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    vxc_ushort8 in_vec, wide0, wide1;
    vxc_ushort8 pre_bits0, pre_bits1, out_vec;
    vxc_ushort8 zeros = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    float carry = 0;
    /* Dot-product masks: all four lanes, and the first three lanes. */
    float4 mask_all = (float4)(1.0, 1.0, 1.0, 1.0);
    float4 mask_xyz = (float4)(1.0, 1.0, 1.0, 0);
    Tensor src_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor dst_tensor = create_tensor_from_image2d_array(output, 2);
    for (; pos.x < width; pos.x += 8)
    {
        float4 val0, val1;
        /* At the last 8-aligned x, move the window back by (8 - remainder).
           NOTE(review): the moved window overlaps lanes already folded into carry;
           confirm the host-side setup accounts for that. */
        if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
        {
            pos.x -= (8 - remainder);
        }
        __global vxc_ushort8* rd = (__global vxc_ushort8*)get_tensor_ptr_from_coord(src_tensor, pos);
        __global vxc_ushort8* wr = (__global vxc_ushort8*)get_tensor_ptr_from_coord(dst_tensor, pos);
        in_vec = *rd;
        VXC_DP2x8(wide0, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
        VXC_DP2x8(wide1, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
        _viv_asm(COPY, val0, wide0, 16);
        _viv_asm(COPY, val1, wide1, 16);
        float4 prefix0 = (float4)(val0.x, val0.x + val0.y, dot(val0, mask_xyz), dot(val0, mask_all));
        float4 prefix1 = (float4)(val1.x, val1.x + val1.y, dot(val1, mask_xyz), dot(val1, mask_all));
        /* The second half continues from the total of the first half, then both take the carry. */
        prefix1 += prefix0.w;
        prefix0 += carry;
        prefix1 += carry;
        carry = prefix1.w;
        _viv_asm(COPY, pre_bits0, prefix0, 16);
        _viv_asm(COPY, pre_bits1, prefix1, 16);
        VXC_DP2x8(out_vec, pre_bits0, pre_bits1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
        *wr = out_vec;
    }
 }
 /*
  * BF16 accumulation along coord.y for 2D images using raw image pointers.
  * Per y step one vxc_ushort8 is loaded, expanded into two float4 halves via
  * the uniConvBF16toF32_Part{0,1} tables, added to two float4 accumulators
  * and packed back into a vxc_ushort8 through uniExtractOddData_2x8.
  * The axis, exclusive and rev arguments are not read by this kernel.
  * NOTE(review): unlike cumsum_array_BF16toBF16_axis2 in this file, this
  * kernel name has no "array_" part - confirm the host-side kernel lookup
  * expects this spelling.
  */
 __kernel void cumsum_BF16toBF16_axis1_2D(
    __read_only image2d_t   input,
    __write_only image2d_t  output,
    int axis, int exclusive, int rev
    )
 {
    int2 pos = (int2)(get_global_id(0), 0);
    vxc_ushort8 in_vec, wide0, wide1;
    vxc_ushort8 acc_bits0, acc_bits1, out_vec;
    vxc_ushort8 zeros = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    float4 acc0 = (float4)(0);
    float4 acc1 = (float4)(0);
    Image src_image = create_image_from_image2d(input, 2);
    Image dst_image = create_image_from_image2d(output, 2);
    /* At the last 8-aligned x, move the window back by (8 - remainder). */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }
    for (; pos.y < height; ++pos.y)
    {
        float4 val0, val1;
        __global vxc_ushort8* rd = (__global vxc_ushort8*)get_image_ptr_from_coord(src_image, pos);
        __global vxc_ushort8* wr = (__global vxc_ushort8*)get_image_ptr_from_coord(dst_image, pos);
        in_vec = *rd;
        VXC_DP2x8(wide0, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
        VXC_DP2x8(wide1, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
        /* View the widened 16 bytes as float4 values. */
        _viv_asm(COPY, val0, wide0, 16);
        _viv_asm(COPY, val1, wide1, 16);
        acc0 += val0;
        acc1 += val1;
        _viv_asm(COPY, acc_bits0, acc0, 16);
        _viv_asm(COPY, acc_bits1, acc1, 16);
        VXC_DP2x8(out_vec, acc_bits0, acc_bits1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
        *wr = out_vec;
    }
 }
 /*
  * BF16 prefix sum along coord.x for 2D images using raw image pointers.
  * The work-item walks its row in steps of 8 elements.  Each vxc_ushort8 is
  * expanded into two float4 halves, the in-vector prefix sums are built with
  * scalar adds and dot products, the carry from the previous vectors is
  * added, and the result is packed back through uniExtractOddData_2x8.
  * The axis, exclusive and rev arguments are not read by this kernel.
  * NOTE(review): unlike cumsum_array_BF16toBF16_axis2 in this file, this
  * kernel name has no "array_" part - confirm the host-side kernel lookup
  * expects this spelling.
  */
 __kernel void cumsum_BF16toBF16_axis0_2D(
    __read_only image2d_t   input,
    __write_only image2d_t  output,
    int axis, int exclusive, int rev
    )
 {
    int2 pos = (int2)(get_global_id(0), get_global_id(1));
    vxc_ushort8 in_vec, wide0, wide1;
    vxc_ushort8 pre_bits0, pre_bits1, out_vec;
    vxc_ushort8 zeros = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
    float carry = 0;
    /* Dot-product masks: all four lanes, and the first three lanes. */
    float4 mask_all = (float4)(1.0, 1.0, 1.0, 1.0);
    float4 mask_xyz = (float4)(1.0, 1.0, 1.0, 0);
    Image src_image = create_image_from_image2d(input, 2);
    Image dst_image = create_image_from_image2d(output, 2);
    for (; pos.x < width; pos.x += 8)
    {
        /* At the last 8-aligned x, move the window back by (8 - remainder).
           NOTE(review): the moved window overlaps lanes already folded into carry;
           confirm the host-side setup accounts for that. */
        if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
        {
            pos.x -= (8 - remainder);
        }
        float4 val0, val1;
        __global vxc_ushort8* rd = (__global vxc_ushort8*)get_image_ptr_from_coord(src_image, pos);
        __global vxc_ushort8* wr = (__global vxc_ushort8*)get_image_ptr_from_coord(dst_image, pos);
        in_vec = *rd;
        VXC_DP2x8(wide0, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
        VXC_DP2x8(wide1, in_vec, zeros, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
        _viv_asm(COPY, val0, wide0, 16);
        _viv_asm(COPY, val1, wide1, 16);
        float4 prefix0 = (float4)(val0.x, val0.x + val0.y, dot(val0, mask_xyz), dot(val0, mask_all));
        float4 prefix1 = (float4)(val1.x, val1.x + val1.y, dot(val1, mask_xyz), dot(val1, mask_all));
        /* The second half continues from the total of the first half, then both take the carry. */
        prefix1 += prefix0.w;
        prefix0 += carry;
        prefix1 += carry;
        carry = prefix1.w;
        _viv_asm(COPY, pre_bits0, prefix0, 16);
        _viv_asm(COPY, pre_bits1, prefix1, 16);
        VXC_DP2x8(out_vec, pre_bits0, pre_bits1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
        *wr = out_vec;
    }
 }
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx
@ -0,0 +1,259 @@
 #include "cl_viv_vx_ext.h"
 /* Uniforms for the axis-0 cumsum kernels below. The VXC_512Bits values are
  * passed as the final argument of the VXC_DP2x8 / VXC_DP4x4 / VXC_DP8x4 calls;
  * their contents are not defined in this file. */
 /* Used by the DP2x8 that builds the output vector from two rounded int4 vectors. */
 _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
 /* Used by the forward (rev == 0) exclusive path of the F16 kernel. */
 _viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
 /* Used by the forward (rev == 0) exclusive path of the quantized kernels. */
 _viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
 _viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
 /* Applied to `sum` once at the start of the F16 kernel (presumably clears it, going by the name). */
 _viv_uniform VXC_512Bits uniSetZeroF16_2x8;
 /* Used by the reversed (rev != 0) paths of the F16 kernel. */
 _viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;
 _viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;
 /* Used by the reversed (rev != 0) paths of the quantized kernels. */
 _viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;
 _viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;
 /* Bound of the coord.x loops. */
 _viv_uniform int width;
 /* Narrowed to short and passed as the second operand of the uniSubZp* DP2x8 calls. */
 _viv_uniform int input_zp;
 /* Quantized results are formed as sum * in_out_scale + output_zp before rounding. */
 _viv_uniform float in_out_scale;
 _viv_uniform float output_zp;
 /* When coord.x == (w_size >> 3) * 8 and remainder != 0, coord.x is moved back by (8 - remainder). */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * Cumulative sum along x for F16 data, addressed through raw tensor pointers.
  *
  * Each work-item takes its row from get_global_id(1) / get_global_id(2) and
  * walks x in steps of 8 lanes, carrying the running total in `sum`.
  * Three of the four exclusive/rev combinations are handled:
  *   exclusive == 0 && rev      : x runs from width - 8 down to 0
  *   exclusive      && rev == 0 : x runs upward, stores go to coord.x + 1
  *   exclusive      && rev      : x runs downward from width - 8
  * When both flags are zero no branch is taken and nothing is written.
  * `axis` is not read.
  */
 __kernel void cumsum_ex_rev_array_F16toF16_axis0(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev
    )
 {
    int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;
    vxc_short8 src, dst;
    vxc_half8 data, tmpsum, sum;
    /* Initialise the running total via the uniSetZeroF16_2x8 uniform. */
    VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
    Tensor img1 = create_tensor_from_image2d_array(input, 2);
    Tensor img2 = create_tensor_from_image2d_array(output, 2);
    /* Pointers for x == 0; the forward exclusive branch stores through out_ptr before its loop. */
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
    __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
    __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
    if(exclusive == 0 && rev)
    {
        /* Inclusive, reversed: load and store at the same x. */
        for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)
        {
            /* Shift the block starting at (w_size >> 3) * 8 back by (8 - remainder). */
            if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
            {
                coord.x = coord.x - (8 - remainder);
            }
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            out_ptr = (__global vxc_short8*)output_ptr;
            src = in_ptr[0];
            _viv_asm(COPY, data, src, 16);
            /* NOTE(review): here uniform A fills lanes 4..7 and B lanes 0..3; the
             * exclusive+rev branch below pairs them the other way round. Confirm. */
            VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
            VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
            VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
                        uniSumHorzRevF16toF16C_2x8);
            /* Fold this block into the running total carried across iterations. */
            VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
            _viv_asm(COPY, dst, sum, 16);
            out_ptr[0] = dst;
        }
    }
    else if(exclusive && rev == 0)
    {
        /* Exclusive, forward: store the initial `sum` at x == 0 first. */
        _viv_asm(COPY, dst, sum, 16);
        out_ptr[0] = dst;
        for(; coord.x < width - 8;)
        {
            if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
            {
                coord.x = coord.x - (8 - remainder);
            }
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            src = in_ptr[0];
            /* Output is written one element to the right of where the input was read. */
            coord_out.x = coord.x + 1;
            coord.x += 8;
            _viv_asm(COPY, data, src, 16);
            output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
            out_ptr = (__global vxc_short8*)output_ptr;
            VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
            VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
            VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
            VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
            _viv_asm(COPY, dst, sum, 16);
            out_ptr[0] = dst;
        }
    }
    else if(exclusive && rev)
    {
        /* Exclusive, reversed: store the initial `sum` starting at the last column. */
        coord.x = width - 8;
        coord_out.x = width - 1;
        _viv_asm(COPY, dst, sum, 16);
        output_ptr = get_tensor_ptr_from_coord(img2, coord_out);
        out_ptr = (__global vxc_short8*)output_ptr;
        /* NOTE(review): this is a full 8-lane store beginning at x == width - 1;
         * confirm the destination tolerates the lanes past the row end. */
        out_ptr[0] = dst;
        for(; coord.x > 0;)
        {
            if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
            {
                coord.x = coord.x - (8 - remainder);
            }
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            out_ptr = (__global vxc_short8*)output_ptr;
            src = in_ptr[0];
            /* NOTE(review): coord_out.x is updated here but never used for the store;
             * out_ptr was derived from `coord` above, unlike the forward exclusive
             * branch which stores through coord_out. Confirm this is intended. */
            coord_out.x = coord.x - 1;
            coord.x -= 8;
            _viv_asm(COPY, data, src, 16);
            VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
            VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
            VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
                        uniSumHorzRevF16toF16C_2x8);
            VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
            _viv_asm(COPY, dst, sum, 16);
            out_ptr[0] = dst;
        }
    }
 }
 /*
  * Generates cumsum_ex_rev_array_<in_name>to<out_name>_axis0 for quantized types.
  *
  *   in_name / out_name : type tags pasted into the kernel name
  *   src_type / dst_type: vector types used for the pointer load and store
  *   stride_data        : second argument of create_tensor_from_image2d_array
  *
  * Each work-item takes its row from get_global_id(1) / get_global_id(2) and
  * walks x in steps of 8. All loads and stores go through raw tensor pointers
  * obtained from get_tensor_ptr_from_coord. Running totals are kept in the int4
  * pair sum0/sum1; each block is emitted as
  *   convert_int4_rte(convert_float4(sum) * in_out_scale + output_zp)
  * and combined into `dst` with uniConvertInt32toUint8_2x8.
  *
  * Branches: (exclusive == 0 && rev), (exclusive && rev == 0), (exclusive && rev).
  * When both flags are zero nothing is written. `axis` is not read.
  *
  * NOTE(review): in the (exclusive && rev) branch coord_out.x is updated but the
  * store uses a pointer derived from `coord`; the forward exclusive branch
  * stores through coord_out. Confirm which is intended.
  * NOTE(review): the forward exclusive loop starts at coord.x == -1 and the
  * reversed exclusive loop at width - 7, so the first pointer load of each
  * reaches one element outside [0, width). Confirm the buffer layout allows it.
  */
 #define CUMSUM_QINT_EX_REV_ARRAY_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \
 __kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis0( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \
    int4 coord_out = coord; \
 \
    src_type src; \
    dst_type dst; \
    vxc_short8 rowSum; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0); \
    short zp = (short)input_zp; \
 \
    Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \
    Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
    __global src_type* in_ptr = (__global src_type*)input_ptr; \
    __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
    if(exclusive == 0 && rev) \
    { \
        for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \
        { \
            if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
            { \
                coord.x = coord.x - (8 - remainder); \
            } \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            out_ptr = (__global dst_type*)output_ptr; \
            /* Pointer load only, matching the other branches of this kernel. */ \
            src = in_ptr[0]; \
            VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
            VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
            VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
            VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumHorzRevI16toI32A_4x4); \
            VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumHorzRevI16toI32B_4x4); \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8); \
            out_ptr[0] = dst; \
        } \
    } \
    else if(exclusive && rev == 0) \
    { \
        for(coord.x = -1; coord.x < width - 8;) \
        { \
            if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
            { \
                coord.x = coord.x - (8 - remainder); \
            } \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            src = in_ptr[0]; \
            coord_out.x = coord.x + 1; \
            coord.x += 8; \
            output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \
            out_ptr = (__global dst_type*)output_ptr; \
            VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \
            VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \
            VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \
            VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumHorzI16toI32A_4x4); \
            VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumHorzI16toI32B_4x4); \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8); \
            out_ptr[0] = dst; \
        } \
    } \
    else if(exclusive && rev) \
    { \
        for(coord.x = width - 7; coord.x > 0;) \
        { \
            if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
            { \
                coord.x = coord.x - (8 - remainder); \
            } \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            out_ptr = (__global dst_type*)output_ptr; \
            src = in_ptr[0]; \
            coord_out.x = coord.x - 1; \
            coord.x -= 8; \
            VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
            VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
            VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
            VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumHorzRevI16toI32A_4x4); \
            VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumHorzRevI16toI32B_4x4); \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8); \
            out_ptr[0] = dst; \
        } \
    } \
 }
 /* Axis-0 kernels for U8, I8 and I16. The last argument is forwarded to
  * create_tensor_from_image2d_array (1 for the 8-bit types, 2 for I16;
  * presumably the element size in bytes). */
 CUMSUM_QINT_EX_REV_ARRAY_AXIS0(U8,  U8,  vxc_uchar16, vxc_uchar16, 1)
 CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I8,  I8,  vxc_char16,  vxc_char16, 1)
 CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I16, I16, vxc_short8,  vxc_short8, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx
@ -0,0 +1,330 @@
 #include "cl_viv_vx_ext.h"
 /* Uniforms for the axis-1 cumsum kernels below. The VXC_512Bits values are
  * passed as the final argument of the VXC_DP2x8 / VXC_DP4x4 calls; their
  * contents are not defined in this file. */
 /* Used by the F16 kernel to fold each loaded row into `sum`. */
 _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
 /* Used to fold each loaded row into sum0..sum3 (8-bit kernels); the I16
  * kernel uses only the A and B variants with sum0/sum1. */
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
 /* Used by the DP2x8 calls that build the output vector from rounded int4 vectors. */
 _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
 /* Applied to `sum` once at the start of the F16 kernel (presumably clears it, going by the name). */
 _viv_uniform VXC_512Bits uniSetZeroF16_2x8;
 /* Bound of the coord.y loops; also enters the (height - coord.y) term of the reversed paths. */
 _viv_uniform int height;
 /* Quantized results are formed as
  *   sum * in_out_scale + (rows * in_out_zp_scale + output_zp)
  * where `rows` is coord.y or (height - coord.y) depending on direction. */
 _viv_uniform float in_out_scale;
 _viv_uniform float in_out_zp_scale;
 _viv_uniform float output_zp;
 /* When coord.x equals the last aligned block start derived from w_size and
  * remainder != 0, coord.x is moved back by (block width - remainder). The
  * block width is 16 in the 8-bit kernels and 8 in the others. */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * Cumulative sum along y for F16 data, addressed through raw tensor pointers.
  *
  * A work-item owns one 8-lane column block selected by get_global_id(0) and
  * get_global_id(2) and steps through the rows one at a time, folding each
  * loaded row into `acc` and storing the running total.
  *   exclusive == 0, rev != 0 : rows height - 1 .. 0, store at the row just read
  *   exclusive != 0, rev == 0 : zero vector at row 0, then store at row + 1
  *   exclusive != 0, rev != 0 : zero vector at row height - 1, then store at row - 1
  * With both flags zero nothing is written. `axis` is not read.
  */
 __kernel void cumsum_ex_rev_array_F16toF16_axis1(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev)
 {
    int4 pos = (int4)(get_global_id(0), 0, get_global_id(2), 0);
    vxc_short8 raw, packed;
    vxc_half8 row, acc;
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);

    /* Pull the trailing partial block back so that it still spans 8 lanes. */
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8))
    {
        pos.x -= (8 - remainder);
    }

    Tensor in_tensor = create_tensor_from_image2d_array(input, 2);
    Tensor out_tensor = create_tensor_from_image2d_array(output, 2);
    __global vxc_short8* in_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(in_tensor, pos);
    __global vxc_short8* out_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(out_tensor, pos);

    if (!exclusive)
    {
        if (rev)
        {
            /* Inclusive, bottom-up: read and write the same row. */
            pos.y = height - 1;
            while (pos.y >= 0)
            {
                in_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(in_tensor, pos);
                out_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(out_tensor, pos);
                raw = in_ptr[0];
                _viv_asm(COPY, row, raw, 16);
                VXC_DP2x8(acc, row, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
                _viv_asm(COPY, packed, acc, 16);
                out_ptr[0] = packed;
                pos.y--;
            }
        }
    }
    else if (!rev)
    {
        /* Exclusive, top-down: row 0 receives zeros, row y + 1 the total up to y. */
        packed ^= packed;
        out_ptr[0] = packed;
        while (pos.y < height - 1)
        {
            in_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(in_tensor, pos);
            raw = in_ptr[0];
            pos.y++;
            _viv_asm(COPY, row, raw, 16);
            out_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(out_tensor, pos);
            VXC_DP2x8(acc, row, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
            _viv_asm(COPY, packed, acc, 16);
            out_ptr[0] = packed;
        }
    }
    else
    {
        /* Exclusive, bottom-up: last row receives zeros, row y - 1 the total down to y. */
        packed ^= packed;
        pos.y = height - 1;
        out_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(out_tensor, pos);
        out_ptr[0] = packed;
        while (pos.y > 0)
        {
            in_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(in_tensor, pos);
            raw = in_ptr[0];
            pos.y--;
            _viv_asm(COPY, row, raw, 16);
            out_ptr = (__global vxc_short8*)get_tensor_ptr_from_coord(out_tensor, pos);
            VXC_DP2x8(acc, row, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
            _viv_asm(COPY, packed, acc, 16);
            out_ptr[0] = packed;
        }
    }
 }
 /*
  * Generates cumsum_ex_rev_array_<in_name>to<out_name>_axis1 for 8-bit types.
  *
  *   in_name / out_name : type tags pasted into the kernel name
  *   src_type / dst_type: 16-lane vector types used for the pointer load/store
  *
  * A work-item owns one 16-lane column block selected by get_global_id(0) and
  * get_global_id(2) and steps through the rows, keeping running totals in
  * sum0..sum3. Each stored row is
  *   convert_int4_rte(convert_float4(sum) * in_out_scale + tmpAlpha)
  * with tmpAlpha = rows * in_out_zp_scale + output_zp, where `rows` is coord.y
  * (forward) or height - coord.y (reversed).
  *
  *   exclusive == 0 && rev      : rows height - 1 .. 0, store at the row just read
  *   exclusive      && rev == 0 : rounded output_zp at row 0, then store at row + 1
  *   exclusive      && rev      : rounded output_zp at row height - 1, then store at row - 1
  * With both flags zero nothing is written. `axis` is not read.
  *
  * Every store pointer is cast to dst_type so that it matches the declared
  * type of out_ptr in all branches.
  */
 #define CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \
 __kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis1( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev) \
 { \
    int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
 \
    src_type src; \
    dst_type dst; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
 \
    if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
    { \
        coord.x = coord.x - (16 - remainder); \
    } \
    Tensor img1 = create_tensor_from_image2d_array(input, 1); \
    Tensor img2 = create_tensor_from_image2d_array(output, 1); \
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
    __global src_type* in_ptr = (__global src_type*)input_ptr; \
    __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
    if(exclusive == 0 && rev) \
    { \
        for(coord.y = height - 1; coord.y >= 0; coord.y--) \
        { \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            out_ptr = (__global dst_type*)output_ptr; \
            src = in_ptr[0]; \
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
            float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            int4 tmpDst2 = convert_int4_rte(tmpSum2); \
            int4 tmpDst3 = convert_int4_rte(tmpSum3); \
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8); \
            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8); \
            out_ptr[0] = dst; \
        } \
    } \
    else if(exclusive && rev == 0) \
    { \
        int tmpAlpha0 = convert_int_rte(output_zp); \
        int4 tmpVal; \
        tmpVal.x = tmpAlpha0; \
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
        /* Only lane 0 of dst is used: it is broadcast to all 16 lanes. */ \
        out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
        for(; coord.y < height - 1;) \
        { \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            src = in_ptr[0]; \
            coord.y++; \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            out_ptr = (__global dst_type*)output_ptr; \
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
            float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            int4 tmpDst2 = convert_int4_rte(tmpSum2); \
            int4 tmpDst3 = convert_int4_rte(tmpSum3); \
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8);\
            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8);\
            out_ptr[0] = dst; \
        } \
    } \
    else if(exclusive && rev) \
    { \
        coord.y = height - 1; \
        int tmpAlpha0 = convert_int_rte(output_zp); \
        int4 tmpVal; \
        tmpVal.x = tmpAlpha0; \
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
        output_ptr = get_tensor_ptr_from_coord(img2, coord); \
        /* Cast to dst_type (not vxc_short8) to match out_ptr's declared type. */ \
        out_ptr = (__global dst_type*)output_ptr; \
        out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
        for(; coord.y > 0;) \
        { \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            src = in_ptr[0]; \
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
            /* Row count is taken before coord.y is decremented for the store. */ \
            float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
            coord.y--; \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            out_ptr = (__global dst_type*)output_ptr; \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            int4 tmpDst2 = convert_int4_rte(tmpSum2); \
            int4 tmpDst3 = convert_int4_rte(tmpSum3); \
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8);\
            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8);\
            out_ptr[0] = dst; \
        } \
    } \
 }
 /* Axis-1 kernels for the two 8-bit types, both using 16-lane vectors. */
 CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)
 CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(I8, I8, vxc_char16,  vxc_char16)
 /*
  * Cumulative sum along y for I16 data, addressed through raw tensor pointers.
  *
  * A work-item owns one 8-lane column block selected by get_global_id(0) and
  * get_global_id(2) and steps through the rows, keeping running totals in
  * sum0/sum1. Each stored row is
  *   convert_int4_rte(convert_float4(sum) * in_out_scale + tmpAlpha)
  * with tmpAlpha = rows * in_out_zp_scale + output_zp, where `rows` is coord.y
  * (forward) or height - coord.y (reversed).
  *
  *   exclusive == 0 && rev      : rows height - 1 .. 0, store at the row just read
  *   exclusive      && rev == 0 : rounded output_zp at row 0, then store at row + 1
  *   exclusive      && rev      : rounded output_zp at row height - 1, then store at row - 1
  * With both flags zero nothing is written. `axis` is not read.
  *
  * The accumulate and pack steps reuse the uniforms named ...U8toI32A/B and
  * uniConvertInt32toUint8_2x8 even though the data here is 16-bit.
  */
 __kernel void cumsum_ex_rev_array_I16toI16_axis1(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev)
 {
    int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);
    vxc_short8 src, dst;
    int4 sum0 = (int4)(0), sum1 = (int4)(0);
    /* Shift the block starting at (w_size >> 3) * 8 back by (8 - remainder). */
    if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
    {
        coord.x = coord.x - (8 - remainder);
    }
    Tensor img1 = create_tensor_from_image2d_array(input, 2);
    Tensor img2 = create_tensor_from_image2d_array(output, 2);
    /* Pointers for row 0; the forward exclusive branch stores through out_ptr before its loop. */
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
    __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
    __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
    if(exclusive == 0 && rev)
    {
        /* Inclusive, bottom-up: read and write the same row. */
        for(coord.y = height - 1; coord.y >= 0; coord.y--)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            out_ptr = (__global vxc_short8*)output_ptr;
            src = in_ptr[0];
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
            /* (height - coord.y) is the number of rows accumulated so far. */
            float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
            int4 tmpDst0 = convert_int4_rte(tmpSum0);
            int4 tmpDst1 = convert_int4_rte(tmpSum1);
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
                        uniConvertInt32toUint8_2x8);
            out_ptr[0] = dst;
        }
    }
    else if(exclusive && rev == 0)
    {
        /* Exclusive, top-down: row 0 receives the rounded output_zp in every lane. */
        int tmpAlpha0 = convert_int_rte(output_zp);
        int4 tmpVal;
        /* Only tmpVal.x is set; only lane 0 of dst is used below (broadcast). */
        tmpVal.x = tmpAlpha0;
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
        out_ptr[0] = dst.xxxxxxxx;
        for(; coord.y < height - 1;)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            src = in_ptr[0];
            /* Advance first so the store lands one row below the row just read. */
            coord.y++;
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            out_ptr = (__global vxc_short8*)output_ptr;
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
            /* coord.y (already incremented) is the number of rows accumulated so far. */
            float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
            int4 tmpDst0 = convert_int4_rte(tmpSum0);
            int4 tmpDst1 = convert_int4_rte(tmpSum1);
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
                        uniConvertInt32toUint8_2x8);
            out_ptr[0] = dst;
        }
    }
    else if(exclusive && rev)
    {
        /* Exclusive, bottom-up: the last row receives the rounded output_zp in every lane. */
        coord.y = height - 1;
        int tmpAlpha0 = convert_int_rte(output_zp);
        int4 tmpVal;
        tmpVal.x = tmpAlpha0;
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
        output_ptr = get_tensor_ptr_from_coord(img2, coord);
        out_ptr = (__global vxc_short8*)output_ptr;
        out_ptr[0] = dst.xxxxxxxx;
        for(; coord.y > 0;)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            src = in_ptr[0];
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
            /* Row count is taken before coord.y is decremented for the store. */
            float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
            coord.y--;
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            out_ptr = (__global vxc_short8*)output_ptr;
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
            int4 tmpDst0 = convert_int4_rte(tmpSum0);
            int4 tmpDst1 = convert_int4_rte(tmpSum1);
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
                        uniConvertInt32toUint8_2x8);
            out_ptr[0] = dst;
        }
    }
 }
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx
@ -0,0 +1,322 @@
 #include "cl_viv_vx_ext.h"
 /* None of the uniforms below is assigned in this file; their values come from outside it. */
 /* DP2x8 configuration used to fold a vector of F16 input into the running F16 sum. */
 _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
 /* DP4x4 configurations used to add source lanes into the int4 partial sums
  * (sum0..sum3 in the 16-lane kernels, sum0/sum1 in the 8-lane kernel). */
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
 _viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
 /* DP2x8 configuration used to pack two int4 results into the output vector. */
 _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
 /* DP2x8 configuration used to initialise the F16 accumulator (presumably to zero). */
 _viv_uniform VXC_512Bits uniSetZeroF16_2x8;
 /* Loop bound for coord.z in the kernels of this file. */
 _viv_uniform int channel;
 /* Requantization terms: result = sum * in_out_scale + count * in_out_zp_scale + output_zp,
  * where count is the number of slices accumulated so far. */
 _viv_uniform float in_out_scale;
 _viv_uniform float in_out_zp_scale;
 _viv_uniform float output_zp;
 /* Used together to move the work-item whose x equals the last full vector boundary
  * of w_size back by (vector width - remainder) elements. */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * Cumulative sum along coord.z for F16 input and F16 output, covering the reverse
  * and/or exclusive variants selected by `rev` and `exclusive`.
  *
  *   input / output : source and destination tensors, accessed through raw pointers
  *                    obtained from get_tensor_ptr_from_coord() (2 bytes per element).
  *   axis           : not referenced in the body.
  *   exclusive      : non-zero -> every output slice holds the sum of the slices
  *                    visited before it; the first slice written is the initial sum.
  *   rev            : non-zero -> slices are walked from channel - 1 down to 0.
  *
  * Each work-item processes 8 consecutive x positions of one row. Nothing is written
  * when both `rev` and `exclusive` are zero.
  */
 __kernel void cumsum_ex_rev_array_F16toF16_axis2(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev)
 {
    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
    vxc_short8 src, dst;
    vxc_half8 data, sum;
    // Initialise the accumulator through the dedicated DP configuration (presumably zero).
    VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
    // Tail work-item: shift x back by (8 - remainder) before any pointer is computed.
    if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
    {
        coord.x = coord.x - (8 - remainder);
    }
    Tensor img1 = create_tensor_from_image2d_array(input, 2);
    Tensor img2 = create_tensor_from_image2d_array(output, 2);
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
    __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
    __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
    if(rev && exclusive == 0)
    {
        // Reverse, inclusive: slice z receives the sum of slices z..channel-1.
        for(coord.z = channel - 1; coord.z >= 0; coord.z--)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            out_ptr = (__global vxc_short8*)output_ptr;
            src = in_ptr[0];
            _viv_asm(COPY, data, src, 16);
            VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
            _viv_asm(COPY, dst, sum, 16);
            out_ptr[0] = dst;
        }
    }
    else if(rev == 0 && exclusive)
    {
        // Forward, exclusive: slice 0 receives the initial sum.
        _viv_asm(COPY, dst, sum, 16);
        out_ptr[0] = dst;
        for(; coord.z < channel - 1;)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            src = in_ptr[0];
            coord.z++;
            _viv_asm(COPY, data, src, 16);
            // Re-point the store at the slice just advanced to; without this every
            // result would overwrite slice 0.
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            out_ptr = (__global vxc_short8*)output_ptr;
            VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
            _viv_asm(COPY, dst, sum, 16);
            out_ptr[0] = dst;
        }
    }
    else if(rev && exclusive)
    {
        // Reverse, exclusive: slice channel-1 receives the initial sum.
        _viv_asm(COPY, dst, sum, 16);
        coord.z = channel - 1;
        output_ptr = get_tensor_ptr_from_coord(img2, coord);
        out_ptr = (__global vxc_short8*)output_ptr;
        out_ptr[0] = dst;
        for(; coord.z > 0;)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            src = in_ptr[0];
            coord.z--;
            _viv_asm(COPY, data, src, 16);
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            out_ptr = (__global vxc_short8*)output_ptr;
            VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
            _viv_asm(COPY, dst, sum, 16);
            out_ptr[0] = dst;
        }
    }
 }
 /*
  * Generates cumsum_ex_rev_array_<in_name>to<out_name>_axis2: cumulative sum along
  * coord.z for 8-bit tensors, covering the reverse and/or exclusive variants.
  *
  *   src_type / dst_type : 16-lane vector types used for the load and the store.
  *   axis                : not referenced in the body.
  *   exclusive / rev     : select the variant; nothing is written when both are zero.
  *
  * Integer sums are kept in sum0..sum3 (4 lanes each) and requantized on every step as
  *   round(sum * in_out_scale + count * in_out_zp_scale + output_zp)
  * where count is the number of slices accumulated so far. In the exclusive variants
  * the first slice written is filled with round(output_zp).
  */
 #define CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \
 __kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis2( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev) \
 { \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
 \
    src_type src; \
    dst_type dst; \
    int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
 \
    /* Tail work-item: shift x back by (16 - remainder) before computing pointers. */ \
    if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \
    { \
        coord.x = coord.x - (16 - remainder); \
    } \
    Tensor img1 = create_tensor_from_image2d_array(input, 1); \
    Tensor img2 = create_tensor_from_image2d_array(output, 1); \
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
    __global src_type* in_ptr = (__global src_type*)input_ptr; \
    __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
    if(rev && exclusive == 0) \
    { \
        /* Reverse, inclusive: walk z from channel - 1 down to 0. */ \
        for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
        { \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            out_ptr = (__global dst_type*)output_ptr; \
            src = in_ptr[0]; \
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            int4 tmpDst2 = convert_int4_rte(tmpSum2); \
            int4 tmpDst3 = convert_int4_rte(tmpSum3); \
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8);\
            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
                        uniConvertInt32toUint8_2x8);\
            out_ptr[0] = dst; \
        } \
    } \
    else if(exclusive && rev == 0) \
    { \
        /* Forward, exclusive: slice 0 is filled with the rounded output zero point. */ \
        int tmpAlpha0 = convert_int_rte(output_zp); \
        int4 tmpVal; \
        tmpVal.x = tmpAlpha0; \
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
        out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
        for(; coord.z < channel - 1;) \
        { \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            src = in_ptr[0]; \
            coord.z++; \
            /* Re-point the store at the slice just advanced to; otherwise every */ \
            /* result would overwrite slice 0. */ \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            out_ptr = (__global dst_type*)output_ptr; \
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
            float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            int4 tmpDst2 = convert_int4_rte(tmpSum2); \
            int4 tmpDst3 = convert_int4_rte(tmpSum3); \
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8); \
            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
                        uniConvertInt32toUint8_2x8); \
            out_ptr[0] = dst; \
        } \
    } \
    else if(rev && exclusive) \
    { \
        /* Reverse, exclusive: slice channel - 1 is filled with the rounded zero point. */ \
        coord.z = channel - 1; \
        int tmpAlpha0 = convert_int_rte(output_zp); \
        int4 tmpVal; \
        tmpVal.x = tmpAlpha0; \
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
        output_ptr = get_tensor_ptr_from_coord(img2, coord); \
        out_ptr = (__global dst_type*)output_ptr; \
        out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \
        for(; coord.z > 0;) \
        { \
            input_ptr = get_tensor_ptr_from_coord(img1, coord); \
            in_ptr = (__global src_type*)input_ptr; \
            src = in_ptr[0]; \
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
            /* count is taken before z is decremented: channel - z slices are summed. */ \
            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
            coord.z--; \
            output_ptr = get_tensor_ptr_from_coord(img2, coord); \
            out_ptr = (__global dst_type*)output_ptr; \
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
            int4 tmpDst0 = convert_int4_rte(tmpSum0); \
            int4 tmpDst1 = convert_int4_rte(tmpSum1); \
            int4 tmpDst2 = convert_int4_rte(tmpSum2); \
            int4 tmpDst3 = convert_int4_rte(tmpSum3); \
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
                        uniConvertInt32toUint8_2x8); \
            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
                        uniConvertInt32toUint8_2x8); \
            out_ptr[0] = dst; \
        } \
    } \
 }
 CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)
 CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)
 /*
  * Cumulative sum along coord.z for I16 input and I16 output, covering the reverse
  * and/or exclusive variants selected by `rev` and `exclusive`.
  *
  *   input / output : source and destination tensors, accessed through raw pointers
  *                    obtained from get_tensor_ptr_from_coord() (2 bytes per element).
  *   axis           : not referenced in the body.
  *   exclusive / rev: select the variant; nothing is written when both are zero.
  *
  * Integer sums are kept in sum0/sum1 (4 lanes each) and requantized on every step as
  *   round(sum * in_out_scale + count * in_out_zp_scale + output_zp)
  * where count is the number of slices accumulated so far. In the exclusive variants
  * the first slice written is filled with round(output_zp).
  */
 __kernel void cumsum_ex_rev_array_I16toI16_axis2(
    __read_only image2d_array_t   input,
    __write_only image2d_array_t  output,
    int axis, int exclusive, int rev)
 {
    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
    vxc_short8 src, dst;
    int4 sum0 = (int4)(0), sum1 = (int4)(0);
    // Tail work-item: shift x back by (8 - remainder) before any pointer is computed.
    if (coord.x == ((w_size >> 3) * 8) && remainder != 0)
    {
        coord.x = coord.x - (8 - remainder);
    }
    Tensor img1 = create_tensor_from_image2d_array(input, 2);
    Tensor img2 = create_tensor_from_image2d_array(output, 2);
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);
    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);
    __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;
    __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;
    if(exclusive == 0 && rev)
    {
        // Reverse, inclusive: walk z from channel - 1 down to 0.
        for(coord.z = channel - 1; coord.z >= 0; coord.z--)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            out_ptr = (__global vxc_short8*)output_ptr;
            src = in_ptr[0];
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
            int4 tmpDst0 = convert_int4_rte(tmpSum0);
            int4 tmpDst1 = convert_int4_rte(tmpSum1);
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
                        uniConvertInt32toUint8_2x8);
            // Store through the pointer computed above, like every other branch here.
            out_ptr[0] = dst;
        }
    }
    else if(exclusive && rev == 0)
    {
        // Forward, exclusive: slice 0 is filled with the rounded output zero point.
        int tmpAlpha0 = convert_int_rte(output_zp);
        int4 tmpVal;
        tmpVal.x = tmpAlpha0;
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
        out_ptr[0] = dst.xxxxxxxx;
        for(; coord.z < channel - 1;)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            src = in_ptr[0];
            coord.z++;
            // Re-point the store at the slice just advanced to; otherwise every
            // result would overwrite slice 0.
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            out_ptr = (__global vxc_short8*)output_ptr;
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
            float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
            int4 tmpDst0 = convert_int4_rte(tmpSum0);
            int4 tmpDst1 = convert_int4_rte(tmpSum1);
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
                        uniConvertInt32toUint8_2x8);
            out_ptr[0] = dst;
        }
    }
    else if(exclusive && rev)
    {
        // Reverse, exclusive: slice channel - 1 is filled with the rounded zero point.
        coord.z = channel - 1;
        int tmpAlpha0 = convert_int_rte(output_zp);
        int4 tmpVal;
        tmpVal.x = tmpAlpha0;
        VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
        output_ptr = get_tensor_ptr_from_coord(img2, coord);
        out_ptr = (__global vxc_short8*)output_ptr;
        out_ptr[0] = dst.xxxxxxxx;
        for(; coord.z > 0;)
        {
            input_ptr = get_tensor_ptr_from_coord(img1, coord);
            in_ptr = (__global vxc_short8*)input_ptr;
            src = in_ptr[0];
            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
            // count is taken before z is decremented: channel - z slices are summed.
            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
            coord.z--;
            // Re-point the store at the slice just stepped to; otherwise every
            // result would overwrite slice channel - 1.
            output_ptr = get_tensor_ptr_from_coord(img2, coord);
            out_ptr = (__global vxc_short8*)output_ptr;
            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
            int4 tmpDst0 = convert_int4_rte(tmpSum0);
            int4 tmpDst1 = convert_int4_rte(tmpSum1);
            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
                        uniConvertInt32toUint8_2x8);
            out_ptr[0] = dst;
        }
    }
 }
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx
@ -0,0 +1,324 @@
 #include "cl_viv_vx_ext.h"
 /* None of the uniforms below is assigned in this file; their values come from outside it. */
 /* DP2x8 configuration used by the axis1/axis2 kernels to fold a vector of F16
  * input into the running F16 sum. */
 _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
 /* DP configurations used by the axis0 kernels: A fills lanes 0..3 and B lanes 4..7
  * of the temporary, C recombines the temporary, and uniAccSumHorz... merges it with
  * the running sum carried across loop iterations. */
 _viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
 /* DP2x8 configuration used to initialise the F16 accumulator (presumably to zero). */
 _viv_uniform VXC_512Bits uniSetZeroF16_2x8;
 /* Loop bounds: width for coord.x (axis0), height for coord.y (axis1),
  * channel for coord.z (axis2). */
 _viv_uniform int width;
 _viv_uniform int height;
 _viv_uniform int channel;
 /* Copied into a vxc_ushort8 and passed as the second operand of the quantizing DP2x8.
  * NOTE(review): "[31:63]" below is probably meant to read "[32:63]" - confirm. */
 _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
 /* DP2x8 configuration that converts the F16 sum to the quantized output type. */
 _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
 /* Used together to move the work-item whose x equals the last full 8-element boundary
  * of w_size back by (8 - remainder) elements. */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * Generates cumsum_array_F16to<out_name>_axis2: inclusive running sum over coord.z of
  * F16 input, converted to dst_type on every step and stored through raw pointers.
  *
  *   out_name   : suffix pasted into the kernel name.
  *   src_type   : not referenced in the body.
  *   dst_type   : vector type used for the store.
  *   stride_out : second argument of create_tensor_from_image2d_array() for the output.
  *
  * The kernel arguments axis, exclusive and rev are not referenced in the body.
  * NOTE(review): the quantizing DP writes lanes 0..7 only, while the store writes a
  * full dst_type vector (16 lanes for the 8-bit instantiations) - confirm intended.
  */
 #define CUMSUM_ARRAY_F16TOQINT_AXIS2(out_name, src_type, dst_type, stride_out) \
 __kernel void cumsum_array_F16to##out_name##_axis2( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
    vxc_short8 raw; \
    vxc_half8 half_in, acc; \
    vxc_ushort8 mul_zp; \
    dst_type quant; \
 \
    /* Initialise the accumulator and fetch the packed multiplier / zero point. */ \
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
    _viv_asm(COPY, mul_zp, multAndoutZP0, 16); \
 \
    Tensor in_tensor = create_tensor_from_image2d_array(input, 2); \
    Tensor out_tensor = create_tensor_from_image2d_array(output, stride_out); \
 \
    /* Tail work-item: shift x back by (8 - remainder). */ \
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8)) \
    { \
        pos.x -= (8 - remainder); \
    } \
 \
    pos.z = 0; \
    while (pos.z < channel) \
    { \
        uchar* in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
        uchar* out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
        __global vxc_short8* in_vec = (__global vxc_short8*)in_bytes; \
        __global dst_type* out_vec = (__global dst_type*)out_bytes; \
 \
        raw = in_vec[0]; \
        _viv_asm(COPY, half_in, raw, 16); \
 \
        /* Accumulate, then convert the running sum to the output type. */ \
        VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                uniAccSumVertF16toF16_2x8); \
        VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                uniU8MulAndPostShift_0_Lo_2x8); \
        out_vec[0] = quant; \
        pos.z++; \
    } \
 }
 CUMSUM_ARRAY_F16TOQINT_AXIS2(I8,  vxc_half8, vxc_char16, 1)
 CUMSUM_ARRAY_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8, 2)
 CUMSUM_ARRAY_F16TOQINT_AXIS2(U8,  vxc_half8, vxc_uchar16, 1)
 /*
  * Generates cumsum_array_F16to<out_name>_axis1: inclusive running sum over coord.y of
  * F16 input, converted to dst_type on every step and stored through raw pointers.
  *
  *   out_name   : suffix pasted into the kernel name.
  *   src_type   : not referenced in the body.
  *   dst_type   : vector type used for the store.
  *   stride_out : second argument of create_tensor_from_image2d_array() for the output.
  *
  * The kernel arguments axis, exclusive and rev are not referenced in the body.
  * NOTE(review): the quantizing DP writes lanes 0..7 only, while the store writes a
  * full dst_type vector (16 lanes for the 8-bit instantiations) - confirm intended.
  */
 #define CUMSUM_ARRAY_F16TOQINT_AXIS1(out_name, src_type, dst_type, stride_out) \
 __kernel void cumsum_array_F16to##out_name##_axis1( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 pos = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
    vxc_short8 raw; \
    vxc_half8 half_in, acc; \
    vxc_ushort8 mul_zp; \
    dst_type quant; \
 \
    /* Initialise the accumulator and fetch the packed multiplier / zero point. */ \
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
    _viv_asm(COPY, mul_zp, multAndoutZP0, 16); \
 \
    Tensor in_tensor = create_tensor_from_image2d_array(input, 2); \
    Tensor out_tensor = create_tensor_from_image2d_array(output, stride_out); \
 \
    /* Tail work-item: shift x back by (8 - remainder). */ \
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8)) \
    { \
        pos.x -= (8 - remainder); \
    } \
 \
    pos.y = 0; \
    while (pos.y < height) \
    { \
        uchar* in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
        uchar* out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
        __global vxc_short8* in_vec = (__global vxc_short8*)in_bytes; \
        __global dst_type* out_vec = (__global dst_type*)out_bytes; \
 \
        raw = in_vec[0]; \
        _viv_asm(COPY, half_in, raw, 16); \
 \
        /* Accumulate, then convert the running sum to the output type. */ \
        VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                uniAccSumVertF16toF16_2x8); \
        VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                uniU8MulAndPostShift_0_Lo_2x8); \
        out_vec[0] = quant; \
        pos.y++; \
    } \
 }
 CUMSUM_ARRAY_F16TOQINT_AXIS1(I8,  vxc_half8, vxc_char16, 1)
 CUMSUM_ARRAY_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8, 2)
 CUMSUM_ARRAY_F16TOQINT_AXIS1(U8,  vxc_half8, vxc_uchar16, 1)
 /*
  * Generates cumsum_array_F16to<out_name>_axis0: running sum along coord.x of F16
  * input, converted to dst_type and stored through raw pointers.
  *
  *   out_name   : suffix pasted into the kernel name.
  *   src_type   : not referenced in the body.
  *   dst_type   : vector type used for the store.
  *   stride_out : second argument of create_tensor_from_image2d_array() for the output.
  *
  * Starting from its own global x, a work-item walks the row in steps of 8 until
  * `width`. For each step the 8 loaded values go through three DP stages (A: lanes
  * 0..3, B: lanes 4..7, C: recombination) to build `tmpsum`, which is then merged with
  * the running `sum` carried from the previous step and quantized.
  * The kernel arguments axis, exclusive and rev are not referenced in the body.
  *
  * NOTE(review): when the tail adjustment moves coord.x back by (8 - remainder), the
  * loaded vector overlaps elements already merged into `sum`; whether the result is
  * still correct depends on the host-provided width / w_size / remainder and DP
  * configurations - confirm.
  * NOTE(review): the quantizing DP writes lanes 0..7 only, while the store writes a
  * full dst_type vector (16 lanes for the 8-bit instantiations) - confirm intended.
  */
 #define CUMSUM_ARRAY_F16TOQINT_AXIS0(out_name, src_type, dst_type, stride_out) \
 __kernel void cumsum_array_F16to##out_name##_axis0( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
 \
    vxc_short8 src; \
    dst_type dst; \
    vxc_half8 data, tmpsum, sum; \
    VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
    vxc_ushort8 ms0; \
    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
    Tensor img1 = create_tensor_from_image2d_array(input, 2); \
    Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
    for(; coord.x < width; coord.x += 8) \
    { \
        if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
        { \
            coord.x = coord.x - (8 - remainder); \
        } \
        uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
        uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
        __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
        __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
        src = in_ptr[0]; \
        _viv_asm(COPY, data, src, 16); \
 \
        VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \
        VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \
        VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \
        VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \
        VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                uniU8MulAndPostShift_0_Lo_2x8); \
        out_ptr[0] = dst; \
    } \
 }
 CUMSUM_ARRAY_F16TOQINT_AXIS0(I8,  vxc_half8, vxc_char16, 1)
 CUMSUM_ARRAY_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8, 2)
 CUMSUM_ARRAY_F16TOQINT_AXIS0(U8,  vxc_half8, vxc_uchar16, 1)
 /*
  * Generates cumsum_array_ex_rev_F16to<out_name>_axis2: cumulative sum over coord.z of
  * F16 input with quantized output, for the reverse and/or exclusive variants.
  *
  *   out_name   : suffix pasted into the kernel name.
  *   src_type   : not referenced in the body.
  *   dst_type   : vector type used for the store.
  *   stride_out : second argument of create_tensor_from_image2d_array() for the output.
  *
  *   exclusive != 0, rev != 0 : slice channel-1 gets the quantized initial sum, then z
  *                              walks down; each slice gets the sum of the ones above.
  *   exclusive != 0, rev == 0 : slice 0 gets the quantized initial sum, then z walks
  *                              up; each slice gets the sum of the ones below.
  *   exclusive == 0, rev != 0 : inclusive sum walking z from channel-1 down to 0.
  *   both zero                : nothing is written.
  *
  * The kernel argument axis is not referenced in the body.
  * NOTE(review): the quantizing DP writes lanes 0..7 only, while the store writes a
  * full dst_type vector (16 lanes for the 8-bit instantiations) - confirm intended.
  */
 #define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type, stride_out) \
 __kernel void cumsum_array_ex_rev_F16to##out_name##_axis2( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
    vxc_short8 raw; \
    vxc_half8 half_in, acc; \
    vxc_ushort8 mul_zp; \
    dst_type quant; \
 \
    /* Initialise the accumulator and fetch the packed multiplier / zero point. */ \
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
    _viv_asm(COPY, mul_zp, multAndoutZP0, 16); \
 \
    /* Tail work-item: shift x back by (8 - remainder) before computing pointers. */ \
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8)) \
    { \
        pos.x -= (8 - remainder); \
    } \
 \
    Tensor in_tensor = create_tensor_from_image2d_array(input, 2); \
    Tensor out_tensor = create_tensor_from_image2d_array(output, stride_out); \
    uchar* in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
    uchar* out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
    __global vxc_short8* in_vec = (__global vxc_short8*)in_bytes; \
    __global dst_type* out_vec = (__global dst_type*)out_bytes; \
 \
    if (exclusive) \
    { \
        /* Both exclusive variants start by quantizing the initial accumulator. */ \
        VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                uniU8MulAndPostShift_0_Lo_2x8); \
        if (rev) \
        { \
            pos.z = channel - 1; \
            out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
            out_vec = (__global dst_type*)out_bytes; \
            out_vec[0] = quant; \
            while (pos.z > 0) \
            { \
                /* Read slice z, then step down and write the updated sum there. */ \
                in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
                in_vec = (__global vxc_short8*)in_bytes; \
                raw = in_vec[0]; \
                pos.z--; \
                out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
                out_vec = (__global dst_type*)out_bytes; \
                _viv_asm(COPY, half_in, raw, 16); \
                VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumVertF16toF16_2x8); \
                VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                        uniU8MulAndPostShift_0_Lo_2x8); \
                out_vec[0] = quant; \
            } \
        } \
        else \
        { \
            out_vec[0] = quant; \
            while (pos.z < channel - 1) \
            { \
                /* Read slice z, then step up and write the updated sum there. */ \
                in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
                in_vec = (__global vxc_short8*)in_bytes; \
                raw = in_vec[0]; \
                pos.z++; \
                out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
                out_vec = (__global dst_type*)out_bytes; \
                _viv_asm(COPY, half_in, raw, 16); \
                VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumVertF16toF16_2x8); \
                VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                        uniU8MulAndPostShift_0_Lo_2x8); \
                out_vec[0] = quant; \
            } \
        } \
    } \
    else if (rev) \
    { \
        for (pos.z = channel - 1; pos.z >= 0; pos.z--) \
        { \
            in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
            out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
            in_vec = (__global vxc_short8*)in_bytes; \
            out_vec = (__global dst_type*)out_bytes; \
            raw = in_vec[0]; \
            _viv_asm(COPY, half_in, raw, 16); \
            VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                    uniAccSumVertF16toF16_2x8); \
            VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                    uniU8MulAndPostShift_0_Lo_2x8); \
            out_vec[0] = quant; \
        } \
    } \
 }
 CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I8,  vxc_half8, vxc_char16, 1)
 CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8, 2)
 CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(U8,  vxc_half8, vxc_uchar16, 1)
 /*
  * Generates cumsum_array_ex_rev_F16to<out_name>_axis1: cumulative sum over coord.y of
  * F16 input with quantized output, for the reverse and/or exclusive variants.
  *
  *   out_name   : suffix pasted into the kernel name.
  *   src_type   : not referenced in the body.
  *   dst_type   : vector type used for the store.
  *   stride_out : second argument of create_tensor_from_image2d_array() for the output.
  *
  *   exclusive != 0, rev != 0 : row height-1 gets the quantized initial sum, then y
  *                              walks down; each row gets the sum of the ones above.
  *   exclusive != 0, rev == 0 : row 0 gets the quantized initial sum, then y walks
  *                              up; each row gets the sum of the ones below.
  *   exclusive == 0, rev != 0 : inclusive sum walking y from height-1 down to 0.
  *   both zero                : nothing is written.
  *
  * The kernel argument axis is not referenced in the body.
  * NOTE(review): the quantizing DP writes lanes 0..7 only, while the store writes a
  * full dst_type vector (16 lanes for the 8-bit instantiations) - confirm intended.
  */
 #define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type, stride_out) \
 __kernel void cumsum_array_ex_rev_F16to##out_name##_axis1( \
    __read_only image2d_array_t   input, \
    __write_only image2d_array_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int4 pos = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
    vxc_short8 raw; \
    vxc_half8 half_in, acc; \
    vxc_ushort8 mul_zp; \
    dst_type quant; \
 \
    /* Initialise the accumulator and fetch the packed multiplier / zero point. */ \
    VXC_DP2x8(acc, acc, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
    _viv_asm(COPY, mul_zp, multAndoutZP0, 16); \
 \
    /* Tail work-item: shift x back by (8 - remainder) before computing pointers. */ \
    if (remainder != 0 && pos.x == ((w_size >> 3) * 8)) \
    { \
        pos.x -= (8 - remainder); \
    } \
 \
    Tensor in_tensor = create_tensor_from_image2d_array(input, 2); \
    Tensor out_tensor = create_tensor_from_image2d_array(output, stride_out); \
    uchar* in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
    uchar* out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
    __global vxc_short8* in_vec = (__global vxc_short8*)in_bytes; \
    __global dst_type* out_vec = (__global dst_type*)out_bytes; \
 \
    if (exclusive) \
    { \
        /* Both exclusive variants start by quantizing the initial accumulator. */ \
        VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                uniU8MulAndPostShift_0_Lo_2x8); \
        if (rev) \
        { \
            pos.y = height - 1; \
            out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
            out_vec = (__global dst_type*)out_bytes; \
            out_vec[0] = quant; \
            while (pos.y > 0) \
            { \
                /* Read row y, then step down and write the updated sum there. */ \
                in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
                in_vec = (__global vxc_short8*)in_bytes; \
                raw = in_vec[0]; \
                pos.y--; \
                out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
                out_vec = (__global dst_type*)out_bytes; \
                _viv_asm(COPY, half_in, raw, 16); \
                VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumVertF16toF16_2x8); \
                VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                        uniU8MulAndPostShift_0_Lo_2x8); \
                out_vec[0] = quant; \
            } \
        } \
        else \
        { \
            out_vec[0] = quant; \
            while (pos.y < height - 1) \
            { \
                /* Read row y, then step up and write the updated sum there. */ \
                in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
                in_vec = (__global vxc_short8*)in_bytes; \
                raw = in_vec[0]; \
                pos.y++; \
                out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
                out_vec = (__global dst_type*)out_bytes; \
                _viv_asm(COPY, half_in, raw, 16); \
                VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                        uniAccSumVertF16toF16_2x8); \
                VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                        uniU8MulAndPostShift_0_Lo_2x8); \
                out_vec[0] = quant; \
            } \
        } \
    } \
    else if (rev) \
    { \
        for (pos.y = height - 1; pos.y >= 0; pos.y--) \
        { \
            in_bytes = get_tensor_ptr_from_coord(in_tensor, pos); \
            out_bytes = get_tensor_ptr_from_coord(out_tensor, pos); \
            in_vec = (__global vxc_short8*)in_bytes; \
            out_vec = (__global dst_type*)out_bytes; \
            raw = in_vec[0]; \
            _viv_asm(COPY, half_in, raw, 16); \
            VXC_DP2x8(acc, half_in, acc, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
                    uniAccSumVertF16toF16_2x8); \
            VXC_DP2x8(quant, acc, mul_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                    uniU8MulAndPostShift_0_Lo_2x8); \
            out_vec[0] = quant; \
        } \
    } \
 }
 CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I8,  vxc_half8, vxc_char16, 1)
 CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8, 2)
 CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(U8,  vxc_half8, vxc_uchar16, 1)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx
@ -0,0 +1,108 @@
 #include "cl_viv_vx_ext.h"
 /*
  * File-scope uniforms read by the cumsum_array_F16to*_2D kernels below.
  * They are only read in this file; their values are presumably supplied by
  * the host-side kernel setup (confirm against the initializer).
  */
 /* DP tables used for the running sum (vertical / horizontal variants). */
 _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
 _viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
 _viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
 /* DP table applied to `sum` at kernel start to clear the accumulator. */
 _viv_uniform VXC_512Bits uniSetZeroF16_2x8;
 /* Loop bounds; `channel` is not referenced by the kernels in this file. */
 _viv_uniform int width;
 _viv_uniform int height;
 _viv_uniform int channel;
 _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [32:63] output zp (second int lane)
 _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
 /* Tail handling: used to shift the last 8-wide step back by (8 - remainder). */
 _viv_uniform int remainder;
 _viv_uniform int w_size;
 /*
  * CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out)
  *
  * Expands to the kernel cumsum_array_F16to<out_name>_axis1_2D.
  *
  * Each work-item starts at x = get_global_id(0), y = 0 and walks y up to
  * height - 1. Per row it loads one vxc_short8 from `input` through a raw
  * pointer, reinterprets it as vxc_half8, folds it into the running `sum`
  * with uniAccSumVertF16toF16_2x8, converts `sum` with
  * uniU8MulAndPostShift_0_Lo_2x8 (second operand ms0, a bit copy of
  * multAndoutZP0) and stores the dst_type result to `output`.
  *
  * Tail handling: if x equals (w_size >> 3) * 8 and remainder != 0, x is
  * moved back by (8 - remainder) before the loop starts.
  *
  *   out_name   - token pasted into the kernel name.
  *   src_type   - not referenced in the macro body.
  *   dst_type   - vector type that is converted to and stored.
  *   stride_out - passed to create_tensor_from_image2d_array for `output`
  *                (presumably the output element size in bytes - confirm).
  *
  * The kernel arguments axis, exclusive and rev are not read by this body.
  *
  * NOTE(review): for the I8/U8 instantiations dst_type is a 16-lane vector
  * and `out_ptr[0] = dst` stores the whole vector, while the DP2x8 modifier
  * appears to select only lanes 0..7 - confirm the extra lanes written to
  * memory are harmless for the launch configuration in use.
  */
 #define CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out) \
 __kernel void cumsum_array_F16to##out_name##_axis1_2D( \
    __read_only image2d_t   input, \
    __write_only image2d_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int2 coord = (int2)(get_global_id(0), 0); \
 \
    vxc_short8 src; \
    dst_type dst; \
    vxc_half8 data, sum; \
    VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
    vxc_ushort8 ms0; \
    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
    Tensor img1 = create_tensor_from_image2d_array(input, 2); \
    Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
    if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
    { \
        coord.x = coord.x - (8 - remainder); \
    } \
    for(; coord.y < height; coord.y++) \
    { \
        uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
        uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
        __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
        __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
        src = in_ptr[0]; \
        _viv_asm(COPY, data, src, 16); \
 \
        VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
                uniAccSumVertF16toF16_2x8); \
        VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                uniU8MulAndPostShift_0_Lo_2x8); \
        out_ptr[0] = dst; \
    } \
 }
 CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I8,  vxc_half8, vxc_char16, 1)
 CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8, 2)
 CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(U8,  vxc_half8, vxc_uchar16, 1)
 /*
  * CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type, stride_out)
  *
  * Expands to the kernel cumsum_array_F16to<out_name>_axis0_2D.
  *
  * Each work-item starts at (get_global_id(0), get_global_id(1)) and steps x
  * by 8 while x < width. Per step it loads one vxc_short8 from `input`,
  * reinterprets it as vxc_half8, builds `tmpsum` from it with the three
  * uniSumHorzF16toF16{A,B,C} tables, combines `tmpsum` with the running
  * `sum` through uniAccSumHorzF16toF16_2x8, converts `sum` with
  * uniU8MulAndPostShift_0_Lo_2x8 (second operand ms0, a bit copy of
  * multAndoutZP0) and stores the dst_type result to `output`.
  *
  * Tail handling: inside the loop, when x equals (w_size >> 3) * 8 and
  * remainder != 0, x is moved back by (8 - remainder) before the load.
  *
  *   out_name   - token pasted into the kernel name.
  *   src_type   - not referenced in the macro body.
  *   dst_type   - vector type that is converted to and stored.
  *   stride_out - passed to create_tensor_from_image2d_array for `output`
  *                (presumably the output element size in bytes - confirm).
  *
  * The kernel arguments axis, exclusive and rev are not read by this body.
  *
  * NOTE(review): after x is shifted back for the tail, the loaded vector
  * overlaps elements that were already folded into `sum` by the previous
  * step - confirm the host-provided tables / launch configuration account
  * for this, otherwise the overlapping lanes would be counted twice.
  *
  * NOTE(review): as in the axis-1 variant, the 8-bit instantiations store a
  * full 16-lane dst_type vector per step - confirm this is intended.
  */
 #define CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type, stride_out) \
 __kernel void cumsum_array_F16to##out_name##_axis0_2D( \
    __read_only image2d_t   input, \
    __write_only image2d_t  output, \
    int axis, int exclusive, int rev \
    ) \
 { \
    int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
 \
    vxc_short8 src; \
    dst_type dst; \
    vxc_half8 data, tmpsum, sum; \
    VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
    vxc_ushort8 ms0; \
    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
    Tensor img1 = create_tensor_from_image2d_array(input, 2); \
    Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \
    for(; coord.x < width; coord.x += 8) \
    { \
        if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \
        { \
            coord.x = coord.x - (8 - remainder); \
        } \
        uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \
        uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \
        __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \
        __global dst_type* out_ptr = (__global dst_type*)output_ptr; \
        src = in_ptr[0]; \
        _viv_asm(COPY, data, src, 16); \
 \
        VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
                uniSumHorzF16toF16A_4x4); \
        VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\
                uniSumHorzF16toF16B_4x4); \
        VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
                uniSumHorzF16toF16C_2x8); \
        VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
                uniAccSumHorzF16toF16_2x8); \
        VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
                uniU8MulAndPostShift_0_Lo_2x8); \
        out_ptr[0] = dst; \
    } \
 }
 CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I8,  vxc_half8, vxc_char16, 1)
 CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8, 2)
 CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(U8,  vxc_half8, vxc_uchar16, 1)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx
@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_1D(
    VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
 /*
  * Same-type gather_nd "array" kernels for a one-component index.
  *
  * GATHER_ND_ARRAY_SAME_TYPE_1D(type_name, data_type, stride) expands to the
  * kernel gather_nd_array_<type_name>_1D, which copies one data_type element
  * per work-item through raw pointers:
  *   gidx = get_global_id(0)   position inside the gathered block
  *   gidy = get_global_id(1)   index entry being processed
  * Row gidy of input1 (4-byte elements) supplies the source row; the element
  * at (gidx, source row) of input0 is written to (gidx, gidy) of output.
  *
  *   type_name - token pasted into the kernel name (e.g. I8toI8).
  *   data_type - scalar type used for the copy; F16 is moved as a raw 16-bit
  *               short, no numeric conversion takes place.
  *   stride    - bytes per element handed to create_image_from_image2d
  *               (1 for the 8-bit kernels, 2 for the 16-bit kernels).
  *
  * Only the first index component is consumed, so a single int is loaded.
  * The previous 16-byte int4 dereference read 12 bytes past the value that is
  * used and was issued on an address that is only known to be int-aligned.
  *
  * block_size and coord_dim belong to the kernel signature but are not read.
  */
 #define GATHER_ND_ARRAY_SAME_TYPE_1D(type_name, data_type, stride) \
 __kernel void gather_nd_array_##type_name##_1D( \
    __read_only image2d_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    Image idx_img = create_image_from_image2d(input1, 4); \
    __global int* idx_ptr = \
        (__global int*)get_image_ptr_from_coord(idx_img, (int2)(0, gidy)); \
    int src_row = idx_ptr[0]; \
 \
    Image src_img = create_image_from_image2d(input0, stride); \
    Image dst_img = create_image_from_image2d(output, stride); \
    __global data_type* src_ptr = \
        (__global data_type*)get_image_ptr_from_coord(src_img, (int2)(gidx, src_row)); \
    __global data_type* dst_ptr = \
        (__global data_type*)get_image_ptr_from_coord(dst_img, (int2)(gidx, gidy)); \
    dst_ptr[0] = src_ptr[0]; \
 }
 GATHER_ND_ARRAY_SAME_TYPE_1D(I8toI8,   char,  1)
 GATHER_ND_ARRAY_SAME_TYPE_1D(U8toU8,   uchar, 1)
 GATHER_ND_ARRAY_SAME_TYPE_1D(I16toI16, short, 2)
 GATHER_ND_ARRAY_SAME_TYPE_1D(F16toF16, short, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx
@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_2D(
    VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
 /*
  * Same-type gather_nd "array" kernels for a two-component index.
  *
  * GATHER_ND_ARRAY_SAME_TYPE_2D(type_name, data_type, stride) expands to the
  * kernel gather_nd_array_<type_name>_2D, which copies one data_type element
  * per work-item through raw pointers:
  *   gidx = get_global_id(0)   position inside the gathered block
  *   gidy = get_global_id(1)   index entry being processed
  * Row gidy of input1 (4-byte elements) supplies (ix, iy); the element at
  * (ix * block_size + gidx, iy) of input0 is written to (gidx, gidy) of
  * output.
  *
  *   type_name - token pasted into the kernel name (e.g. I8toI8).
  *   data_type - scalar type used for the copy; F16 is moved as a raw 16-bit
  *               short, no numeric conversion takes place.
  *   stride    - bytes per element handed to create_image_from_image2d
  *               (1 for the 8-bit kernels, 2 for the 16-bit kernels).
  *
  * Only two index components are consumed, so two ints are loaded. The
  * previous 16-byte int4 dereference read 8 bytes past the values that are
  * used and was issued on an address that is only known to be int-aligned.
  *
  * coord_dim belongs to the kernel signature but is not read.
  */
 #define GATHER_ND_ARRAY_SAME_TYPE_2D(type_name, data_type, stride) \
 __kernel void gather_nd_array_##type_name##_2D( \
    __read_only image2d_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    Image idx_img = create_image_from_image2d(input1, 4); \
    __global int* idx_ptr = \
        (__global int*)get_image_ptr_from_coord(idx_img, (int2)(0, gidy)); \
    int2 src_pos = (int2)(idx_ptr[0] * block_size + gidx, idx_ptr[1]); \
 \
    Image src_img = create_image_from_image2d(input0, stride); \
    Image dst_img = create_image_from_image2d(output, stride); \
    __global data_type* src_ptr = \
        (__global data_type*)get_image_ptr_from_coord(src_img, src_pos); \
    __global data_type* dst_ptr = \
        (__global data_type*)get_image_ptr_from_coord(dst_img, (int2)(gidx, gidy)); \
    dst_ptr[0] = src_ptr[0]; \
 }
 GATHER_ND_ARRAY_SAME_TYPE_2D(I8toI8,   char,  1)
 GATHER_ND_ARRAY_SAME_TYPE_2D(U8toU8,   uchar, 1)
 GATHER_ND_ARRAY_SAME_TYPE_2D(I16toI16, short, 2)
 GATHER_ND_ARRAY_SAME_TYPE_2D(F16toF16, short, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx
@ -80,3 +80,85 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \
 GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16)
 GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16)
 GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8)
 /*
  * GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride)
  *
  * Expands to the kernel gather_nd_array_<src0_type_name>toF16_2D.
  *
  * Work-item (gidx, gidy) reads an int4 from row gidy of input1 (4-byte
  * elements), uses (x * block_size + gidx, y) as the position in input0,
  * loads one read_type vector from there, converts it with
  * uniU8MulAndPostShift_0_Lo_2x8 (second operand ms0, a bit copy of
  * multAndoutZP0) into a vxc_half8 and stores its bits as a vxc_short8 at
  * (gidx, gidy) of output.
  *
  *   src0_type_name - token pasted into the kernel name.
  *   read_type      - vector type loaded from input0.
  *   ptr_type       - pointer-to-read_type used for the load.
  *   stride         - passed to create_image_from_image2d for input0.
  *
  * multAndoutZP0 and uniU8MulAndPostShift_0_Lo_2x8 are declared outside the
  * lines shown here. coord_dim is not read by this body.
  *
  * NOTE(review): each work-item loads and stores a full vector (8 output
  * elements) while gidx appears to advance one element per work-item in the
  * scalar same-type kernels - confirm the launch configuration, otherwise
  * neighbouring work-items overlap and the tail can run past the row end.
  */
 #define GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride) \
 __kernel void gather_nd_array_##src0_type_name##toF16_2D( \
    __read_only image2d_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    int4 coord = (int4)(0, gidy, gidx, 0); \
    Image img = create_image_from_image2d(input1, 4); \
    uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
    int4 indice = ((int4 *)indice_ptr)[0]; \
 \
    indice.x = indice.x * block_size + gidx; \
 \
    Image img1 = create_image_from_image2d(input0, stride); \
    Image img2 = create_image_from_image2d(output, 2); \
 \
    uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
 \
    __global ptr_type data_ptr = (__global ptr_type)input_ptr; \
    __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
    read_type src = data_ptr[0]; \
 \
    vxc_half8  src0; \
    vxc_short8 dst0; \
    vxc_ushort8 ms0; \
    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
    VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
    _viv_asm(COPY, dst0, src0, 16); \
    dst_ptr[0] = dst0; \
 }
 GATHER_ND_ARRAY_QINT_TO_F16_2D(U8, vxc_uchar16, vxc_uchar16*, 1)
 GATHER_ND_ARRAY_QINT_TO_F16_2D(I8, vxc_char16, vxc_char16*, 1)
 GATHER_ND_ARRAY_QINT_TO_F16_2D(I16, vxc_short8, vxc_short8*, 2)
 /*
  * GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride)
  *
  * Expands to the kernel gather_nd_array_F16to<src1_type_name>_2D.
  *
  * Work-item (gidx, gidy) reads an int4 from row gidy of input1 (4-byte
  * elements), uses (x * block_size + gidx, y) as the position in input0,
  * loads one vxc_short8 from there, reinterprets it as vxc_half8, converts it
  * with uniConvertFp16toU8_2x8 (second operand mp1, a bit copy of
  * multAndoutZP1) and stores the write_type result at (gidx, gidy) of output.
  *
  *   src1_type_name - token pasted into the kernel name.
  *   write_type     - vector type produced and stored.
  *   ptr_type       - pointer-to-write_type used for the store.
  *   stride         - passed to create_image_from_image2d for output.
  *
  * multAndoutZP1 and uniConvertFp16toU8_2x8 are declared outside the lines
  * shown here. coord_dim is not read by this body.
  *
  * NOTE(review): each work-item loads and stores a full vector while gidx
  * appears to advance one element per work-item in the scalar same-type
  * kernels - confirm the launch configuration, otherwise neighbouring
  * work-items overlap and the tail can run past the row end.
  */
 #define GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride) \
 __kernel void gather_nd_array_F16to##src1_type_name##_2D( \
    __read_only image2d_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    int4 coord = (int4)(0, gidy, gidx, 0); \
    Image img = create_image_from_image2d(input1, 4); \
    uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
    int4 indice = ((int4 *)indice_ptr)[0]; \
 \
    indice.x = indice.x * block_size + gidx; \
 \
    Image img1 = create_image_from_image2d(input0, 2); \
    Image img2 = create_image_from_image2d(output, stride); \
 \
    uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
 \
    __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
    __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
    vxc_short8 src = data_ptr[0]; \
 \
    vxc_ushort8 mp1; \
    _viv_asm(COPY, mp1, multAndoutZP1, 16); \
    vxc_half8 data; \
    write_type dst; \
    _viv_asm(COPY, data, src, 16); \
    VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \
    dst_ptr[0] = dst; \
 }
 GATHER_ND_ARRAY_F16_TO_QINT_2D(U8, vxc_uchar16, vxc_uchar16*, 1)
 GATHER_ND_ARRAY_F16_TO_QINT_2D(I8, vxc_char16, vxc_char16*, 1)
 GATHER_ND_ARRAY_F16_TO_QINT_2D(I16, vxc_short8, vxc_short8*, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx
@ -98,3 +98,120 @@ __kernel void gather_nd_F16toF16_3D(
    VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
 /*
  * Same-type gather_nd "array" kernels for a three-component index.
  *
  * GATHER_ND_ARRAY_SAME_TYPE_3D(type_name, data_type, stride) expands to the
  * kernel gather_nd_array_<type_name>_3D, which copies one data_type element
  * per work-item through raw pointers:
  *   gidx = get_global_id(0)   position inside the gathered block
  *   gidy = get_global_id(1)   index entry being processed
  * Row gidy of input1 (4-byte elements) supplies (ix, iy, iz); the element
  * at (ix * block_size + gidx, iy, iz, 0) of input0 is written to
  * (gidx, gidy) of output.
  *
  *   type_name - token pasted into the kernel name (e.g. I8toI8).
  *   data_type - scalar type used for the copy; F16 is moved as a raw 16-bit
  *               short, no numeric conversion takes place.
  *   stride    - bytes per element handed to the image/tensor constructors
  *               (1 for the 8-bit kernels, 2 for the 16-bit kernels).
  *
  * Only three index components are consumed (the fourth was always forced
  * to 0), so three ints are loaded. The previous 16-byte int4 dereference
  * read 4 bytes past the values that are used and was issued on an address
  * that is only known to be int-aligned.
  *
  * coord_dim belongs to the kernel signature but is not read.
  */
 #define GATHER_ND_ARRAY_SAME_TYPE_3D(type_name, data_type, stride) \
 __kernel void gather_nd_array_##type_name##_3D( \
    __read_only image2d_array_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    Image idx_img = create_image_from_image2d(input1, 4); \
    __global int* idx_ptr = \
        (__global int*)get_image_ptr_from_coord(idx_img, (int2)(0, gidy)); \
    int4 src_pos = (int4)(idx_ptr[0] * block_size + gidx, idx_ptr[1], idx_ptr[2], 0); \
 \
    Tensor src_tensor = create_tensor_from_image2d_array(input0, stride); \
    Image dst_img = create_image_from_image2d(output, stride); \
    __global data_type* src_ptr = \
        (__global data_type*)get_tensor_ptr_from_coord(src_tensor, src_pos); \
    __global data_type* dst_ptr = \
        (__global data_type*)get_image_ptr_from_coord(dst_img, (int2)(gidx, gidy)); \
    dst_ptr[0] = src_ptr[0]; \
 }
 GATHER_ND_ARRAY_SAME_TYPE_3D(I8toI8,   char,  1)
 GATHER_ND_ARRAY_SAME_TYPE_3D(U8toU8,   uchar, 1)
 GATHER_ND_ARRAY_SAME_TYPE_3D(I16toI16, short, 2)
 GATHER_ND_ARRAY_SAME_TYPE_3D(F16toF16, short, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx
@ -80,3 +80,86 @@ GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16)
 GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16)
 GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8)
 /*
  * GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride)
  *
  * Expands to the kernel gather_nd_array_<src0_type_name>toF16_3D.
  *
  * Work-item (gidx, gidy) reads an int4 from row gidy of input1 (4-byte
  * elements), rewrites it to (x * block_size + gidx, y, z, 0) and uses that
  * as the position in the input0 tensor. It loads one read_type vector from
  * there, converts it with uniU8MulAndPostShift_0_Lo_2x8 (second operand
  * ms0, a bit copy of multAndoutZP0) into a vxc_half8 and stores its bits as
  * a vxc_short8 at (gidx, gidy) of output.
  *
  *   src0_type_name - token pasted into the kernel name.
  *   read_type      - vector type loaded from input0.
  *   ptr_type       - pointer-to-read_type used for the load.
  *   stride         - passed to create_tensor_from_image2d_array for input0.
  *
  * multAndoutZP0 and uniU8MulAndPostShift_0_Lo_2x8 are declared outside the
  * lines shown here. coord_dim is not read by this body.
  *
  * NOTE(review): each work-item loads and stores a full vector while gidx
  * appears to advance one element per work-item in the scalar same-type
  * kernels - confirm the launch configuration, otherwise neighbouring
  * work-items overlap and the tail can run past the row end.
  */
 #define GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride) \
 __kernel void gather_nd_array_##src0_type_name##toF16_3D( \
    __read_only image2d_array_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    int4 coord = (int4)(0, gidy, gidx, 0); \
    Image img = create_image_from_image2d(input1, 4); \
    uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
    int4 indice = ((int4 *)indice_ptr)[0]; \
 \
    indice.x = indice.x * block_size + gidx; \
    indice.w = 0; \
    Tensor img1 = create_tensor_from_image2d_array(input0, stride); \
    Image img2 = create_image_from_image2d(output, 2); \
 \
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
 \
    __global ptr_type data_ptr = (__global ptr_type)input_ptr; \
    __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
    read_type src = data_ptr[0]; \
 \
    vxc_half8  src0; \
    vxc_short8 dst0; \
    vxc_ushort8 ms0; \
    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
    VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
    _viv_asm(COPY, dst0, src0, 16); \
    dst_ptr[0] = dst0; \
 }
 GATHER_ND_ARRAY_QINT_TO_F16_3D(U8, vxc_uchar16, vxc_uchar16*, 1)
 GATHER_ND_ARRAY_QINT_TO_F16_3D(I8, vxc_char16, vxc_char16*, 1)
 GATHER_ND_ARRAY_QINT_TO_F16_3D(I16, vxc_short8, vxc_short8*, 2)
 /*
  * GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride)
  *
  * Expands to the kernel gather_nd_array_F16to<src1_type_name>_3D.
  *
  * Work-item (gidx, gidy) reads an int4 from row gidy of input1 (4-byte
  * elements), rewrites it to (x * block_size + gidx, y, z, 0) and uses that
  * as the position in the input0 tensor. It loads one vxc_short8 from there,
  * reinterprets it as vxc_half8, converts it with uniConvertFp16toU8_2x8
  * (second operand mp1, a bit copy of multAndoutZP1) and stores the
  * write_type result at (gidx, gidy) of output.
  *
  *   src1_type_name - token pasted into the kernel name.
  *   write_type     - vector type produced and stored.
  *   ptr_type       - pointer-to-write_type used for the store.
  *   stride         - passed to create_image_from_image2d for output.
  *
  * multAndoutZP1 and uniConvertFp16toU8_2x8 are declared outside the lines
  * shown here. coord_dim is not read by this body.
  *
  * NOTE(review): each work-item loads and stores a full vector while gidx
  * appears to advance one element per work-item in the scalar same-type
  * kernels - confirm the launch configuration, otherwise neighbouring
  * work-items overlap and the tail can run past the row end.
  */
 #define GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride) \
 __kernel void gather_nd_array_F16to##src1_type_name##_3D( \
    __read_only image2d_array_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    int4 coord = (int4)(0, gidy, gidx, 0); \
    Image img = create_image_from_image2d(input1, 4); \
    uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
    int4 indice = ((int4 *)indice_ptr)[0]; \
 \
    indice.x = indice.x * block_size + gidx; \
    indice.w = 0; \
 \
    Tensor img1 = create_tensor_from_image2d_array(input0, 2); \
    Image img2 = create_image_from_image2d(output, stride); \
 \
    uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
 \
    __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
    __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
    vxc_short8 src = data_ptr[0]; \
 \
    vxc_ushort8 mp1; \
    _viv_asm(COPY, mp1, multAndoutZP1, 16); \
    vxc_half8 data; \
    write_type dst; \
    _viv_asm(COPY, data, src, 16); \
    VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \
    dst_ptr[0] = dst; \
 }
 GATHER_ND_ARRAY_F16_TO_QINT_3D(U8, vxc_uchar16, vxc_uchar16*, 1)
 GATHER_ND_ARRAY_F16_TO_QINT_3D(I8, vxc_char16, vxc_char16*, 1)
 GATHER_ND_ARRAY_F16_TO_QINT_3D(I16, vxc_short8, vxc_short8*, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx
@ -95,3 +95,118 @@ __kernel void gather_nd_batch_F16toF16_1D(
    VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
 /*
  * Same-type batched gather_nd "array" kernels for a one-component index.
  *
  * GATHER_ND_ARRAY_BATCH_SAME_TYPE_1D(type_name, data_type, stride) expands
  * to the kernel gather_nd_array_batch_<type_name>_1D, which copies one
  * data_type element per work-item through raw pointers:
  *   gidx = get_global_id(0)   position inside the gathered block
  *   gidy = get_global_id(1)   index entry being processed
  *   gidz = get_global_id(2)   batch
  * Position (0, gidy, gidz, 0) of the input1 tensor (4-byte elements)
  * supplies ix; the element at (ix * block_size + gidx, gidz) of input0 is
  * written to (gidx, gidy, gidz, 0) of output.
  *
  *   type_name - token pasted into the kernel name (e.g. I8toI8).
  *   data_type - scalar type used for the copy; F16 is moved as a raw 16-bit
  *               short, no numeric conversion takes place.
  *   stride    - bytes per element handed to the image/tensor constructors
  *               (1 for the 8-bit kernels, 2 for the 16-bit kernels).
  *
  * Only the first index component is consumed, so a single int is loaded.
  * The previous 16-byte int4 dereference read 12 bytes past the value that is
  * used and was issued on an address that is only known to be int-aligned.
  *
  * coord_dim belongs to the kernel signature but is not read.
  */
 #define GATHER_ND_ARRAY_BATCH_SAME_TYPE_1D(type_name, data_type, stride) \
 __kernel void gather_nd_array_batch_##type_name##_1D( \
    __read_only image2d_t   input0, \
    __read_only image2d_array_t   input1, \
    __write_only image2d_array_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
    int gidz = get_global_id(2); \
    int4 dst_pos = (int4)(gidx, gidy, gidz, 0); \
 \
    Tensor idx_tensor = create_tensor_from_image2d_array(input1, 4); \
    __global int* idx_ptr = \
        (__global int*)get_tensor_ptr_from_coord(idx_tensor, (int4)(0, gidy, gidz, 0)); \
    int2 src_pos = (int2)(idx_ptr[0] * block_size + gidx, gidz); \
 \
    Image src_img = create_image_from_image2d(input0, stride); \
    Tensor dst_tensor = create_tensor_from_image2d_array(output, stride); \
    __global data_type* src_ptr = \
        (__global data_type*)get_image_ptr_from_coord(src_img, src_pos); \
    __global data_type* dst_ptr = \
        (__global data_type*)get_tensor_ptr_from_coord(dst_tensor, dst_pos); \
    dst_ptr[0] = src_ptr[0]; \
 }
 GATHER_ND_ARRAY_BATCH_SAME_TYPE_1D(I8toI8,   char,  1)
 GATHER_ND_ARRAY_BATCH_SAME_TYPE_1D(U8toU8,   uchar, 1)
 GATHER_ND_ARRAY_BATCH_SAME_TYPE_1D(I16toI16, short, 2)
 GATHER_ND_ARRAY_BATCH_SAME_TYPE_1D(F16toF16, short, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx
@ -26,7 +26,7 @@ __kernel void gather_nd_batch_I8toI8_2D(
    VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
-__kernel void gather_nd_U8toU8_2D(
+__kernel void gather_nd_batch_U8toU8_2D(
    __read_only image2d_array_t   input0,
    __read_only image2d_array_t   input1,
    __write_only image2d_array_t  output,
@ -51,7 +51,7 @@ __kernel void gather_nd_U8toU8_2D(
    VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
-__kernel void gather_nd_I16toI16_2D(
+__kernel void gather_nd_batch_I16toI16_2D(
    __read_only image2d_array_t   input0,
    __read_only image2d_array_t   input1,
    __write_only image2d_array_t  output,
@ -76,7 +76,7 @@ __kernel void gather_nd_I16toI16_2D(
    VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
-__kernel void gather_nd_F16toF16_2D(
+__kernel void gather_nd_batch_F16toF16_2D(
    __read_only image2d_array_t   input0,
    __read_only image2d_array_t   input1,
    __write_only image2d_array_t  output,
@ -100,3 +100,123 @@ __kernel void gather_nd_F16toF16_2D(
    VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
 }
 /*
  * Batched gather_nd for 2D coordinates, signed 8-bit data, using direct
  * pointer access instead of image read/write intrinsics.
  *
  * Work-item layout: x = offset inside the gathered block, y = index number,
  * z = batch number. Each work-item copies exactly one char element.
  * coord_dim is accepted but not read by this kernel.
  */
 __kernel void gather_nd_array_batch_I8toI8_2D(
    __read_only image2d_array_t   input0,
    __read_only image2d_array_t   input1,
    __write_only image2d_array_t  output,
    int block_size,
    int coord_dim
    )
 {
    const int blk_off = get_global_id(0);  // offset inside the block
    const int idx_num = get_global_id(1);  // index num
    const int bat_num = get_global_id(2);  // batch num
    int4 dst_pos = (int4)(blk_off, idx_num, bat_num, 0);

    // Index tensor is addressed with a 4-byte element size; data tensors with 1 byte.
    Tensor idx_view = create_tensor_from_image2d_array(input1, 4);
    Tensor src_view = create_tensor_from_image2d_array(input0, 1);
    Tensor dst_view = create_tensor_from_image2d_array(output, 1);

    // Fetch the index tuple stored at (0, idx_num, bat_num).
    uchar* idx_addr = get_tensor_ptr_from_coord(idx_view, dst_pos.wyzw);
    int4 src_pos = *((int4 *)idx_addr);

    // Keep the fetched y, scale x to a block start plus this item's offset,
    // and pin z/w to the current batch.
    src_pos.zw = dst_pos.zw;
    src_pos.x = src_pos.x * block_size + blk_off;

    uchar* src_addr = get_tensor_ptr_from_coord(src_view, src_pos);
    uchar* dst_addr = get_tensor_ptr_from_coord(dst_view, dst_pos);
    __global char* src_elem = (__global char*)src_addr;
    __global char* dst_elem = (__global char*)dst_addr;

    dst_elem[0] = src_elem[0];
 }
 /*
  * Batched gather_nd for 2D coordinates, unsigned 8-bit data, using direct
  * pointer access instead of image read/write intrinsics.
  *
  * Work-item layout: x = offset inside the gathered block, y = index number,
  * z = batch number. Each work-item copies exactly one uchar element.
  * coord_dim is accepted but not read by this kernel.
  */
 __kernel void gather_nd_array_batch_U8toU8_2D(
    __read_only image2d_array_t   input0,
    __read_only image2d_array_t   input1,
    __write_only image2d_array_t  output,
    int block_size,
    int coord_dim
    )
 {
    const int blk_off = get_global_id(0);  // offset inside the block
    const int idx_num = get_global_id(1);  // index num
    const int bat_num = get_global_id(2);  // batch num
    int4 dst_pos = (int4)(blk_off, idx_num, bat_num, 0);

    // Index tensor is addressed with a 4-byte element size; data tensors with 1 byte.
    Tensor idx_view = create_tensor_from_image2d_array(input1, 4);
    Tensor src_view = create_tensor_from_image2d_array(input0, 1);
    Tensor dst_view = create_tensor_from_image2d_array(output, 1);

    // Fetch the index tuple stored at (0, idx_num, bat_num).
    uchar* idx_addr = get_tensor_ptr_from_coord(idx_view, dst_pos.wyzw);
    int4 src_pos = *((int4 *)idx_addr);

    // Keep the fetched y, scale x to a block start plus this item's offset,
    // and pin z/w to the current batch.
    src_pos.zw = dst_pos.zw;
    src_pos.x = src_pos.x * block_size + blk_off;

    uchar* src_addr = get_tensor_ptr_from_coord(src_view, src_pos);
    uchar* dst_addr = get_tensor_ptr_from_coord(dst_view, dst_pos);
    __global uchar* src_elem = (__global uchar*)src_addr;
    __global uchar* dst_elem = (__global uchar*)dst_addr;

    dst_elem[0] = src_elem[0];
 }
 /*
  * Batched gather_nd for 2D coordinates, 16-bit integer data, using direct
  * pointer access instead of image read/write intrinsics.
  *
  * Work-item layout: x = offset inside the gathered block, y = index number,
  * z = batch number. Each work-item copies exactly one short element.
  * coord_dim is accepted but not read by this kernel.
  */
 __kernel void gather_nd_array_batch_I16toI16_2D(
    __read_only image2d_array_t   input0,
    __read_only image2d_array_t   input1,
    __write_only image2d_array_t  output,
    int block_size,
    int coord_dim
    )
 {
    const int blk_off = get_global_id(0);  // offset inside the block
    const int idx_num = get_global_id(1);  // index num
    const int bat_num = get_global_id(2);  // batch num
    int4 dst_pos = (int4)(blk_off, idx_num, bat_num, 0);

    // Index tensor is addressed with a 4-byte element size; data tensors with 2 bytes.
    Tensor idx_view = create_tensor_from_image2d_array(input1, 4);
    Tensor src_view = create_tensor_from_image2d_array(input0, 2);
    Tensor dst_view = create_tensor_from_image2d_array(output, 2);

    // Fetch the index tuple stored at (0, idx_num, bat_num).
    uchar* idx_addr = get_tensor_ptr_from_coord(idx_view, dst_pos.wyzw);
    int4 src_pos = *((int4 *)idx_addr);

    // Keep the fetched y, scale x to a block start plus this item's offset,
    // and pin z/w to the current batch.
    src_pos.zw = dst_pos.zw;
    src_pos.x = src_pos.x * block_size + blk_off;

    uchar* src_addr = get_tensor_ptr_from_coord(src_view, src_pos);
    uchar* dst_addr = get_tensor_ptr_from_coord(dst_view, dst_pos);
    __global short* src_elem = (__global short*)src_addr;
    __global short* dst_elem = (__global short*)dst_addr;

    dst_elem[0] = src_elem[0];
 }
 /*
  * Batched gather_nd for 2D coordinates, half-precision data, using direct
  * pointer access instead of image read/write intrinsics.
  *
  * Work-item layout: x = offset inside the gathered block, y = index number,
  * z = batch number. Each work-item copies one 16-bit element; the value is
  * moved as a raw short, so no floating-point conversion takes place.
  * coord_dim is accepted but not read by this kernel.
  */
 __kernel void gather_nd_array_batch_F16toF16_2D(
    __read_only image2d_array_t   input0,
    __read_only image2d_array_t   input1,
    __write_only image2d_array_t  output,
    int block_size,
    int coord_dim
    )
 {
    const int blk_off = get_global_id(0);  // offset inside the block
    const int idx_num = get_global_id(1);  // index num
    const int bat_num = get_global_id(2);  // batch num
    int4 dst_pos = (int4)(blk_off, idx_num, bat_num, 0);

    // Index tensor is addressed with a 4-byte element size; data tensors with 2 bytes.
    Tensor idx_view = create_tensor_from_image2d_array(input1, 4);
    Tensor src_view = create_tensor_from_image2d_array(input0, 2);
    Tensor dst_view = create_tensor_from_image2d_array(output, 2);

    // Fetch the index tuple stored at (0, idx_num, bat_num).
    uchar* idx_addr = get_tensor_ptr_from_coord(idx_view, dst_pos.wyzw);
    int4 src_pos = *((int4 *)idx_addr);

    // Keep the fetched y, scale x to a block start plus this item's offset,
    // and pin z/w to the current batch.
    src_pos.zw = dst_pos.zw;
    src_pos.x = src_pos.x * block_size + blk_off;

    uchar* src_addr = get_tensor_ptr_from_coord(src_view, src_pos);
    uchar* dst_addr = get_tensor_ptr_from_coord(dst_view, dst_pos);
    __global short* src_elem = (__global short*)src_addr;
    __global short* dst_elem = (__global short*)dst_addr;

    dst_elem[0] = src_elem[0];
 }
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx
@ -81,3 +81,85 @@ GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16)
 GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16)
 GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)
 /*
  * Generates kernel gather_nd_array_<src0_type_name>toF16_1D (pointer-access
  * variant of 1D gather_nd, quantized integer input -> F16 output).
  *
  *   src0_type_name : suffix used in the kernel name (U8 / I8 / I16 below)
  *   read_type      : vector type loaded from input0
  *   ptr_type       : pointer-to-read_type used to address input0
  *   stride         : value passed to create_image_from_image2d for input0
  *                    (presumably the element size in bytes - confirm)
  *
  * Per work-item: an int4 is read from input1 at (0, gidy); its x component
  * selects the row of input0 while gidx selects the column. One read_type
  * vector is loaded, converted with VXC_DP2x8 using the uniform
  * uniU8MulAndPostShift_0_Lo_2x8 and the packed multAndoutZP0 value (both
  * declared outside this macro), and the result is stored as one vxc_short8
  * at (gidx, gidy) of output.
  *
  * block_size and coord_dim are part of the signature but are not read here.
  */
 #define GATHER_ND_ARRAY_QINT_TO_F16_1D(src0_type_name, read_type, ptr_type, stride) \
 __kernel void gather_nd_array_##src0_type_name##toF16_1D( \
    __read_only image2d_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    int4 coord = (int4)(0, gidy, gidx, 0); \
    Image img = create_image_from_image2d(input1, 4); \
    uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
    int4 indice = ((int4 *)indice_ptr)[0]; \
 \
    coord.w = indice.x; \
 \
    Image img1 = create_image_from_image2d(input0, stride); \
    Image img2 = create_image_from_image2d(output, 2); \
 \
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
 \
    __global ptr_type data_ptr = (__global ptr_type)input_ptr; \
    __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \
    read_type src = data_ptr[0]; \
 \
    vxc_half8  src0; \
    vxc_short8 dst0; \
    vxc_ushort8 ms0; \
    _viv_asm(COPY, ms0, multAndoutZP0, 16); \
    VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \
    _viv_asm(COPY, dst0, src0, 16); \
    dst_ptr[0] = dst0; \
 }
 /* Instantiations: 8-bit inputs use stride 1, 16-bit input uses stride 2. */
 GATHER_ND_ARRAY_QINT_TO_F16_1D(U8, vxc_uchar16, vxc_uchar16*, 1)
 GATHER_ND_ARRAY_QINT_TO_F16_1D(I8, vxc_char16, vxc_char16*, 1)
 GATHER_ND_ARRAY_QINT_TO_F16_1D(I16, vxc_short8, vxc_short8*, 2)
 /*
  * Generates kernel gather_nd_array_F16to<src1_type_name>_1D (pointer-access
  * variant of 1D gather_nd, F16 input -> quantized integer output).
  *
  *   src1_type_name : suffix used in the kernel name (U8 / I8 / I16 below)
  *   write_type     : vector type stored to output
  *   ptr_type       : pointer-to-write_type used to address output
  *   stride         : value passed to create_image_from_image2d for output
  *                    (presumably the element size in bytes - confirm)
  *
  * Per work-item: an int4 is read from input1 at (0, gidy); its x component
  * selects the row of input0 while gidx selects the column. One vxc_short8 is
  * loaded, reinterpreted as vxc_half8, converted with VXC_DP2x8 using the
  * uniform uniConvertFp16toU8_2x8 and the packed multAndoutZP1 value (both
  * declared outside this macro), and one write_type vector is stored at
  * (gidx, gidy) of output.
  *
  * block_size and coord_dim are part of the signature but are not read here.
  *
  * NOTE(review): for the U8/I8 instantiations write_type has 16 lanes while
  * the DP modifier is (0, 7, ...); dst is not otherwise initialised in this
  * macro before the full-vector store - confirm the upper lanes are harmless.
  */
 #define GATHER_ND_ARRAY_F16_TO_QINT_1D(src1_type_name, write_type, ptr_type, stride) \
 __kernel void gather_nd_array_F16to##src1_type_name##_1D( \
    __read_only image2d_t   input0, \
    __read_only image2d_t   input1, \
    __write_only image2d_t  output, \
    int block_size, \
    int coord_dim \
    ) \
 { \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
 \
    int4 coord = (int4)(0, gidy, gidx, 0); \
    Image img = create_image_from_image2d(input1, 4); \
    uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
    int4 indice = ((int4 *)indice_ptr)[0]; \
 \
    coord.w = indice.x; \
 \
    Image img1 = create_image_from_image2d(input0, 2); \
    Image img2 = create_image_from_image2d(output, stride); \
 \
    uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \
    uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \
 \
    __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \
    __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \
    vxc_short8 src = data_ptr[0]; \
    vxc_ushort8 mp1; \
    _viv_asm(COPY, mp1, multAndoutZP1, 16); \
    vxc_half8 data; \
    write_type dst; \
    _viv_asm(COPY, data, src, 16); \
    VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
    dst_ptr[0] = dst; \
 }
 /* Instantiations: 8-bit outputs use stride 1, 16-bit output uses stride 2. */
 GATHER_ND_ARRAY_F16_TO_QINT_1D(U8, vxc_uchar16, vxc_uchar16*, 1)
 GATHER_ND_ARRAY_F16_TO_QINT_1D(I8, vxc_char16, vxc_char16*, 1)
 GATHER_ND_ARRAY_F16_TO_QINT_1D(I16, vxc_short8, vxc_short8*, 2)
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx
@ -65,5 +65,5 @@ __kernel void pre_process_gray_half_U8toU8
    coord_in.xy = coord_in.xy >> 1;
-    VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+    VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
 }
--- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
+++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c
@ -62,11 +62,20 @@ static vsi_status _argmaxmin_op_compute
    }
    status = VSI_FAILURE;
-    param =vsi_nn_kernel_param_create();
+    param = vsi_nn_kernel_param_create();
    if (strcmp(kernel_name, "argmax") == 0)
    {
        vsi_nn_argmax_param * p = &(self->nn_param.argmax);
        axis = p->axis;
 #if (VX_ARGMAX_VX_SUPPORT)
        vsi_nn_kernel_param_add_int32(param, "axis", axis);
        self->n = (vx_node)vsi_nn_kernel_selector(self->graph,
            kernel_name,
            inputs, 1,
            outputs, 1, param);
        goto final;
 #endif
    }
    else
    {
@ -101,6 +110,10 @@ static vsi_status _argmaxmin_op_compute
        vsi_nn_ReleaseTensor( &reshape_tensors[0] );
        vsi_nn_ReleaseTensor( &reshape_tensors[1] );
    }
 #if (VX_ARGMAX_VX_SUPPORT)
 final:
 #endif
    if( self->n )
    {
        status = VSI_SUCCESS;
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c
@ -0,0 +1,153 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include <string.h>
 #include <stdlib.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_ops.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_constraint_check.h"
 #include "vsi_nn_error.h"
 /*
  * Local data type declared for the bitcast op. It only carries a
  * placeholder member; none of the functions visible in this file read it.
  */
 typedef struct _bitcast_local_data_t {
    int32_t placeholder; /* unused filler so the struct is not empty */
 } bitcast_local_data_t;
 /*
 Declare number of input and output.
 */
 #define _INPUT_NUM          (1)
 #define _OUTPUT_NUM         (1)
 /*
  * Create the backend node for BITCAST.
  *
  * Asks the kernel selector for a "bitcast" kernel with one input and one
  * output and no extra parameters, stores the resulting handle in self->n,
  * and reports VSI_SUCCESS only when a node was actually created.
  */
 static vsi_status op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    vsi_nn_kernel_node_t kernel_node = vsi_nn_kernel_selector(
        self->graph, "bitcast", inputs, 1, outputs, 1, NULL );

    /* The handle is stored even when it is NULL, as the original did. */
    self->n = (vx_node)kernel_node;

    return (NULL != kernel_node) ? VSI_SUCCESS : VSI_FAILURE;
 } /* op_compute() */
 /*
  * Infer the output shape of BITCAST when the caller left it as
  * VSI_NN_DIM_AUTO. If the output rank is already set nothing is changed.
  *
  * With input_byte / output_byte being the element sizes reported by
  * vsi_nn_TypeGetBytesExt():
  *   - equal sizes   : output shape is a copy of the input shape;
  *   - input larger  : a new dimension of input_byte / output_byte is
  *                     inserted at size[0], the input dims follow;
  *   - input smaller : size[0] of the input must equal
  *                     output_byte / input_byte and is removed.
  *
  * Returns TRUE on success, FALSE (after logging) when the shape cannot be
  * inferred: an element size of zero, a rank that would exceed
  * VSI_NN_MAX_DIM_NUM, or an input whose size[0] does not match the ratio.
  */
 static vsi_bool op_setup
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    uint32_t i = 0;
    uint32_t in_dim = 0;
    uint32_t input_byte = 0;
    uint32_t output_byte = 0;
    uint32_t ratio = 0;

    VSI_UNREFERENCED(self);

    if (VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num)
    {
        /* Output shape was supplied by the caller. */
        return TRUE;
    }

    in_dim = inputs[0]->attr.dim_num;
    input_byte = vsi_nn_TypeGetBytesExt(inputs[0]->attr.dtype.vx_type);
    output_byte = vsi_nn_TypeGetBytesExt(outputs[0]->attr.dtype.vx_type);

    /* Both sizes are used as divisors below. */
    if (0 == input_byte || 0 == output_byte)
    {
        VSILOGE("bitcast: unsupported element size (input %u bytes, output %u bytes)",
            (unsigned int)input_byte, (unsigned int)output_byte);
        return FALSE;
    }

    if (input_byte == output_byte)
    {
        outputs[0]->attr.dim_num = in_dim;
        for (i = 0; i < in_dim; i++)
        {
            outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
        }
    }
    else if (input_byte > output_byte)
    {
        /* One extra dimension is needed; make sure it fits in size[]. */
        if (in_dim >= VSI_NN_MAX_DIM_NUM)
        {
            VSILOGE("bitcast: output rank %u exceeds the maximum of %u",
                (unsigned int)(in_dim + 1), (unsigned int)VSI_NN_MAX_DIM_NUM);
            return FALSE;
        }
        ratio = input_byte / output_byte;
        outputs[0]->attr.dim_num = in_dim + 1;
        outputs[0]->attr.size[0] = ratio;
        for (i = 0; i < in_dim; i++)
        {
            outputs[0]->attr.size[i + 1] = inputs[0]->attr.size[i];
        }
    }
    else
    {
        ratio = output_byte / input_byte;
        /*
         * size[0] is the dimension that gets folded into the wider element
         * (it is the one dropped below, and the one inserted by the branch
         * above), so it is the one that must equal the ratio; otherwise the
         * total byte count of input and output would differ.
         */
        if (0 == in_dim || (uint32_t)(inputs[0]->attr.size[0]) != ratio)
        {
            VSILOGE("If input datatype is smaller than output datatype, bitcast op requires that "
                    "the rightmost dimension be equal to sizeof(output datatype) / sizeof(input datatype)");
            return FALSE;
        }
        outputs[0]->attr.dim_num = in_dim - 1;
        if (outputs[0]->attr.dim_num == 0)
        {
            /*
             * NOTE(review): dim_num is left at 0 for the scalar result, as in
             * the original code - confirm this is the intended rank for a
             * tensor flagged as scalar.
             */
            outputs[0]->attr.size[0] = 1;
            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
        }
        else
        {
            for (i = 0; i < outputs[0]->attr.dim_num; i++)
            {
                outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1];
            }
        }
    }

    return TRUE;
 } /* op_setup() */
 __BEGIN_DECLS
 /*
  * Registrar: binds the BITCAST op to the handlers defined in this file.
  * Only compute and setup are provided; init, deinit, check and optimize
  * are left NULL. The op takes one input and produces one output.
  */
 DEF_OP_REG
    (
    /* op_name    */ BITCAST,
    /* init       */ NULL,
    /* compute    */ op_compute,
    /* deinit     */ NULL,
    /* check      */ NULL,
    /* setup      */ op_setup,
    /* optimize   */ NULL,
    /* input_num  */ _INPUT_NUM,
    /* output_num */ _OUTPUT_NUM
    );
 __END_DECLS
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c
@ -0,0 +1,258 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include <string.h>
 #include <stdlib.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_ops.h"
 #include "vsi_nn_tensor.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_constraint_check.h"
 /*
  * Local data type declared for the col2im op. It only carries a
  * placeholder member; none of the functions visible in this file read it.
  */
 typedef struct _col2im_local_data_t {
    int32_t placeholder; /* unused filler so the struct is not empty */
 } col2im_local_data_t;
 /*
 Declare number of input and output.
 */
 #define _INPUT_NUM          (1)
 #define _OUTPUT_NUM         (1)
 /*
  * Create the backend node for COL2IM.
  *
  * Packs strides, pads, dilations and the block shape from
  * self->nn_param.col2im into a kernel parameter set, asks the kernel
  * selector for a "col2im" kernel with one input and one output, and stores
  * the resulting handle in self->n.
  *
  * Returns VSI_SUCCESS when a node was created, VSI_FAILURE otherwise
  * (including when the parameter set could not be allocated). The parameter
  * set is always released before returning.
  */
 static vsi_status op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_param_t* param = NULL;

    param = vsi_nn_kernel_param_create();
    if (NULL == param)
    {
        VSILOGE("col2im: failed to create kernel parameters");
        return status;
    }

    vsi_nn_kernel_param_add_int32( param, "stride_w", self->nn_param.col2im.strides[0] );
    vsi_nn_kernel_param_add_int32( param, "stride_h", self->nn_param.col2im.strides[1] );
    vsi_nn_kernel_param_add_int32( param, "stride_d", self->nn_param.col2im.strides[2] );
    vsi_nn_kernel_param_add_int32( param, "pad_w_front", self->nn_param.col2im.pads[0] );
    vsi_nn_kernel_param_add_int32( param, "pad_w_end", self->nn_param.col2im.pads[1] );
    vsi_nn_kernel_param_add_int32( param, "pad_h_front", self->nn_param.col2im.pads[2] );
    vsi_nn_kernel_param_add_int32( param, "pad_h_end", self->nn_param.col2im.pads[3] );
    vsi_nn_kernel_param_add_int32( param, "pad_d_front", self->nn_param.col2im.pads[4] );
    vsi_nn_kernel_param_add_int32( param, "pad_d_end", self->nn_param.col2im.pads[5] );
    vsi_nn_kernel_param_add_int32( param, "dilation_w", self->nn_param.col2im.dilations[0] );
    vsi_nn_kernel_param_add_int32( param, "dilation_h", self->nn_param.col2im.dilations[1] );
    vsi_nn_kernel_param_add_int32( param, "dilation_d", self->nn_param.col2im.dilations[2] );
    vsi_nn_kernel_param_add_buffer( param, "block_shape", (void*)self->nn_param.col2im.block_shape, \
                                    self->nn_param.col2im.dim_num );

    self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "col2im",
        inputs, 1, outputs, 1, param );
    if (self->n)
    {
        status = VSI_SUCCESS;
    }

    /* The parameter set is only needed during kernel selection. */
    vsi_nn_kernel_param_release( &param );

    return status;
 } /* op_compute() */
 /*
  * Check the input/output data types of a COL2IM node.
  *
  * The table below lists the (input, output) type combinations accepted for
  * the op, one IO_TYPE entry per pair. The node's tensors are validated
  * against it with VALIDATE_OP_IO_TYPES; on a mismatch a description of the
  * actual types is logged and FALSE is returned. Returns TRUE otherwise.
  */
 static vsi_bool op_check
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    /* Table for 1 input and 1 output: IO_TYPE(input type, output type). */
    BEGIN_IO_TYPE_DECL(COL2IM, 1, 1)
        IO_TYPE(D_F32,        D_F32)
        IO_TYPE(D_F32,        D_I32)
        IO_TYPE(D_F32,        D_U32)
        IO_TYPE(D_F32,        D_F16)
        IO_TYPE(D_I32,        D_F32)
        IO_TYPE(D_I32,        D_I32)
        IO_TYPE(D_I32,        D_U32)
        IO_TYPE(D_I32,        D_F16)
        IO_TYPE(D_U32,        D_F32)
        IO_TYPE(D_U32,        D_I32)
        IO_TYPE(D_U32,        D_U32)
        IO_TYPE(D_F16,        D_I16|Q_DFP)
        IO_TYPE(D_F16,        D_I16|Q_ASYM)
        IO_TYPE(D_F16,        D_I16|Q_SYM)
        IO_TYPE(D_F16,        D_I8|Q_DFP)
        IO_TYPE(D_F16,        D_I8|Q_ASYM)
        IO_TYPE(D_F16,        D_I8|Q_SYM)
        IO_TYPE(D_F16,        D_U8|Q_ASYM)
        IO_TYPE(D_I16|Q_DFP,  D_F16)
        IO_TYPE(D_I16|Q_DFP,  D_I8|Q_DFP)
        IO_TYPE(D_I16|Q_DFP,  D_U8|Q_ASYM)
        IO_TYPE(D_I16|Q_ASYM, D_F16)
        IO_TYPE(D_I16|Q_ASYM, D_I8|Q_DFP)
        IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM)
        IO_TYPE(D_I16|Q_SYM,  D_F16)
        IO_TYPE(D_I16|Q_SYM,  D_I8|Q_DFP)
        IO_TYPE(D_I16|Q_SYM,  D_U8|Q_ASYM)
        IO_TYPE(D_I16,        D_F16)
        IO_TYPE(D_I16,        D_I8|Q_DFP)
        IO_TYPE(D_I16,        D_U8|Q_ASYM)
        IO_TYPE(D_I16,        D_I32)
        IO_TYPE(D_I16,        D_U32)
        IO_TYPE(D_I16,        D_F32)
        IO_TYPE(D_I8|Q_DFP,   D_F16)
        IO_TYPE(D_I8|Q_DFP,   D_I16|Q_DFP)
        IO_TYPE(D_I8|Q_DFP,   D_U8|Q_ASYM)
        IO_TYPE(D_I8|Q_ASYM,  D_F16)
        IO_TYPE(D_I8|Q_ASYM,  D_I16|Q_DFP)
        IO_TYPE(D_I8|Q_ASYM,  D_U8|Q_ASYM)
        IO_TYPE(D_I8|Q_SYM,   D_F16)
        IO_TYPE(D_I8|Q_SYM,   D_I16|Q_DFP)
        IO_TYPE(D_I8|Q_SYM,   D_U8|Q_ASYM)
        IO_TYPE(D_I8,         D_F16)
        IO_TYPE(D_I8,         D_I16|Q_DFP)
        IO_TYPE(D_I8,         D_U8|Q_ASYM)
        IO_TYPE(D_I8,         D_I32)
        IO_TYPE(D_I8,         D_U32)
        IO_TYPE(D_I8,         D_F32)
        IO_TYPE(D_U8|Q_ASYM,  D_F16)
        IO_TYPE(D_U8|Q_ASYM,  D_I16|Q_DFP)
        IO_TYPE(D_U8|Q_ASYM,  D_I8|Q_DFP)
        IO_TYPE(D_U8,         D_F16)
        IO_TYPE(D_U8,         D_I16|Q_DFP)
        IO_TYPE(D_U8,         D_I8|Q_DFP)
        IO_TYPE(D_U8,         D_I32)
        IO_TYPE(D_U8,         D_U32)
        IO_TYPE(D_U8,         D_F32)
        IO_TYPE(D_F32,        D_I16|Q_DFP)
        IO_TYPE(D_F32,        D_I16|Q_ASYM)
        IO_TYPE(D_F32,        D_I16|Q_SYM)
        IO_TYPE(D_F32,        D_I8|Q_DFP)
        IO_TYPE(D_F32,        D_I8|Q_ASYM)
        IO_TYPE(D_F32,        D_I8|Q_SYM)
        IO_TYPE(D_F32,        D_U8|Q_ASYM)
        IO_TYPE(D_I32,        D_I16|Q_DFP)
        IO_TYPE(D_I32,        D_I16|Q_ASYM)
        IO_TYPE(D_I32,        D_I16|Q_SYM)
        IO_TYPE(D_I32,        D_I8|Q_DFP)
        IO_TYPE(D_I32,        D_I8|Q_ASYM)
        IO_TYPE(D_I32,        D_I8|Q_SYM)
        IO_TYPE(D_I32,        D_U8|Q_ASYM)
        IO_TYPE(D_F16,        D_F32)
        IO_TYPE(D_F16,        D_I32)
        IO_TYPE(D_F16,        D_I16)
        IO_TYPE(D_F16,        D_U8)
        IO_TYPE(D_F16,        D_I8)
        IO_TYPE(D_F16,        D_F16)
        IO_TYPE(D_U8|Q_ASYM,  D_U8|Q_ASYM)
        IO_TYPE(D_I8|Q_DFP,   D_I8|Q_DFP)
        IO_TYPE(D_I8|Q_ASYM,  D_I8|Q_ASYM)
        IO_TYPE(D_I8|Q_SYM,   D_I8|Q_SYM)
        IO_TYPE(D_I16|Q_DFP,  D_I16|Q_DFP)
        IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
        IO_TYPE(D_I16|Q_SYM,  D_I16|Q_SYM)
        IO_TYPE(D_U8|Q_ASYM,  D_F32)
        IO_TYPE(D_U8|Q_ASYM,  D_I32)
        IO_TYPE(D_BF16,       D_BF16)
    END_IO_TYPE_DECL(COL2IM)
    if (!VALIDATE_OP_IO_TYPES(COL2IM, self, inputs, self->input.num, outputs, self->output.num)) {
        /* Build a readable description of the offending types for the log,
         * then free it again. */
        char* desc = generate_op_io_types_desc(inputs,
                self->input.num, outputs, self->output.num);
        VSILOGE("Inputs/Outputs data type not support: %s", desc);
        destroy_op_io_types_desc(desc);
        return FALSE;
    }
    return TRUE;
 } /* op_check() */
 /*
  * Infer the output shape of COL2IM when the caller left it as
  * VSI_NN_DIM_AUTO. If the output rank is already set nothing is changed.
  *
  * The output has p->dim_num + 2 dimensions:
  *   size[0 .. dim_num-1] = image_shape[0 .. dim_num-1]
  *   size[dim_num]        = inputs[0]->attr.size[1] / product(block_shape)
  *   size[dim_num + 1]    = inputs[0]->attr.size[0]
  *
  * Returns TRUE on success, FALSE (after logging) when the requested rank
  * does not fit in the size array or the block shape has a zero product.
  */
 static vsi_bool op_setup
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    vsi_nn_col2im_param *p = (vsi_nn_col2im_param* )&(self->nn_param.col2im);
    int32_t i = 0;
    vsi_size_t block_size = 1;
    vsi_size_t channel = 1;

    if (VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num)
    {
        /* Output shape was supplied by the caller. */
        return TRUE;
    }

    /* Two extra dimensions are appended; a negative dim_num also lands here
     * because of the unsigned comparison. */
    if ((uint32_t)p->dim_num > (uint32_t)VSI_NN_MAX_DIM_NUM - 2)
    {
        VSILOGE("col2im: dim_num %d is out of range", (int)p->dim_num);
        return FALSE;
    }

    outputs[0]->attr.dim_num = p->dim_num + 2;
    for (i = 0; i < p->dim_num; i++)
    {
        outputs[0]->attr.size[i] = (vsi_size_t)p->image_shape[i];
        block_size = block_size * (vsi_size_t)p->block_shape[i];
    }

    /* block_size is a divisor below. */
    if (0 == block_size)
    {
        VSILOGE("col2im: block_shape must not contain a zero");
        return FALSE;
    }

    channel = inputs[0]->attr.size[1] / block_size;

    /*
     * After the loop i == p->dim_num, so the two trailing dimensions live at
     * i and i + 1 (the last valid index for a rank of dim_num + 2).
     */
    outputs[0]->attr.size[i] = channel;
    /* NOTE(review): batch is taken from input size[0], unchanged from the
     * original code - confirm against the expected input layout. */
    outputs[0]->attr.size[i + 1] = inputs[0]->attr.size[0];

    return TRUE;
 } /* op_setup() */
 /*
  * Fill in the default COL2IM parameters: all six pad entries are 0 and all
  * three stride and dilation entries are 1. Always returns VSI_SUCCESS.
  */
 static vsi_status op_init
    (
    vsi_nn_node_t* self
    )
 {
    uint32_t idx = 0;

    /* pads[0..5]: no padding on any side */
    for (idx = 0; idx < 6; idx++)
    {
        self->nn_param.col2im.pads[idx] = 0;
    }

    /* strides[0..2] / dilations[0..2]: unit step */
    for (idx = 0; idx < 3; idx++)
    {
        self->nn_param.col2im.strides[idx] = 1;
        self->nn_param.col2im.dilations[idx] = 1;
    }

    return VSI_SUCCESS;
 }
 __BEGIN_DECLS
 /*
  * Registrar: binds the COL2IM op to the handlers defined in this file
  * (init, compute, check, setup) plus the shared vsi_nn_op_common_deinit.
  * No optimize hook is provided. The op takes one input and produces one
  * output.
  */
 DEF_OP_REG
    (
    /* op_name    */ COL2IM,
    /* init       */ op_init,
    /* compute    */ op_compute,
    /* deinit     */ vsi_nn_op_common_deinit,
    /* check      */ op_check,
    /* setup      */ op_setup,
    /* optimize   */ NULL,
    /* input_num  */ _INPUT_NUM,
    /* output_num */ _OUTPUT_NUM
    );
 __END_DECLS
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
@ -28,6 +28,7 @@
 #include "vsi_nn_prv.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_types_prv.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_ops.h"
 #include "vsi_nn_tensor.h"
@ -278,7 +279,7 @@ static vsi_status op_compute
    if(_is_tensorview_support(self, outputs)
        && _is_same_quant(self, inputs, outputs)
        && (_has_norm_input(self, inputs) == FALSE)
-        && self->graph->ctx->options.enable_concat_optimize)
+        && ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize)
    {
        iter = self->nn_param.concat.lcl_data;
        while( NULL != iter )
@ -443,7 +444,7 @@ static vsi_status op_optimize
    if (_is_tensorview_support(self, outputs) == FALSE ||
        _is_same_quant(self, inputs, outputs) == FALSE ||
        _has_norm_input(self, inputs) == TRUE ||
-        self->graph->ctx->options.enable_concat_optimize == 0)
+        ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize == 0)
    {
        return status;
    }
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
@ -23,6 +23,7 @@
 *****************************************************************************/
 #include <string.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_graph.h"
@ -95,7 +96,7 @@ static vsi_status op_optimize
    status = VSI_SUCCESS;
-    if( !self->graph->ctx->options.enable_dataconvert_optimize )
+    if( !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_dataconvert_optimize )
    {
        return status;
    }
@ -266,14 +267,14 @@ static vsi_bool op_check
        IO_TYPE(D_BF16,       D_BF16)
        IO_TYPE(D_BF16,       D_F16)
        IO_TYPE(D_BF16,       D_F32)
-        IO_TYPE(D_I32,        D_I32)
+        IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM)
-        IO_TYPE(D_I32,        D_F32)
+        IO_TYPE(D_I32|Q_ASYM, D_F32)
-        IO_TYPE(D_I32,        D_F16)
+        IO_TYPE(D_I32|Q_ASYM, D_F16)
-        IO_TYPE(D_I32,        D_I16|Q_DFP)
+        IO_TYPE(D_I32|Q_ASYM, D_I16|Q_DFP)
-        IO_TYPE(D_I32,        D_I8|Q_DFP)
+        IO_TYPE(D_I32|Q_ASYM, D_I8|Q_DFP)
-        IO_TYPE(D_I32,        D_U32)
+        IO_TYPE(D_I32|Q_ASYM, D_U32|Q_ASYM)
-        IO_TYPE(D_I32,        D_U16)
+        IO_TYPE(D_I32|Q_ASYM, D_U16|Q_ASYM)
-        IO_TYPE(D_I32,        D_U8|Q_ASYM)
+        IO_TYPE(D_I32|Q_ASYM, D_U8|Q_ASYM)
        IO_TYPE(D_U32,        D_U32)
        IO_TYPE(D_U32,        D_I16|Q_DFP)
        IO_TYPE(D_U32,        D_I8|Q_DFP)
@ -281,7 +282,7 @@ static vsi_bool op_check
        IO_TYPE(D_U32,        D_U8|Q_ASYM)
        IO_TYPE(D_U32,        D_U8)
        IO_TYPE(D_BF16,       D_I32)
-        IO_TYPE(D_I32,        D_BF16)
+        IO_TYPE(D_I32|Q_ASYM, D_BF16)
        IO_TYPE(D_U4|Q_ASYM,  D_U8|Q_ASYM)
        IO_TYPE(D_U4|Q_SYM,   D_U8|Q_ASYM)
        IO_TYPE(D_U8|Q_ASYM,  D_U4|Q_ASYM)
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
@ -183,10 +183,16 @@ vsi_bool vsi_nn_op_eltwise_setup
        shape[i] = sz0;
    }
-    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
    {
        outputs[0]->attr.dim_num = out_rank;
        memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
        if (out_rank == 1 &&
            vsi_nn_GetTensorIsScalar(inputs[0]) &&
            vsi_nn_GetTensorIsScalar(inputs[1]))
        {
            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
        }
    }
    else
    {
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c
@ -54,10 +54,12 @@ static vsi_status op_compute
    vsi_nn_kernel_param_t* param = NULL;
    int32_t align_corners = self->nn_param.gridsample.align_corners;
    int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode;
    int32_t mode = (int32_t)self->nn_param.gridsample.mode;
    vsi_nn_kernel_node_t n;
    char kernel_name[128];
    param = vsi_nn_kernel_param_create();
    vsi_nn_kernel_param_add_int32(param, "mode", mode);
    vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
    vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode);
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c
@ -0,0 +1,412 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include <string.h>
 #include <stdlib.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_prv.h"
 #include "utils/vsi_nn_math.h"
 #include "vsi_nn_ops.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_constraint_check.h"
 /*
 Declare number of input and output.
 */
 /* Counts used to describe this op; only _INPUT_NUM and _OUTPUT_NUM are
  * referenced in this file (they are handed to DEF_OP_REG below). */
 #define _ARG_NUM            (1)
 /* inputs[0], inputs[1] and inputs[2] are read by op_compute; inputs[2] may be NULL. */
 #define _INPUT_NUM          (3)
 #define _OUTPUT_NUM         (1)
 #define _IO_NUM             (_INPUT_NUM + _OUTPUT_NUM)
 #define _PARAM_NUM          (_ARG_NUM + _IO_NUM)
 /* Typed view of nn_param->local.  Expands to an expression that uses a
  * variable named `nn_param`, which must exist in the calling scope. */
 #define LOCAL() ((vsi_nn_grouped_conv3d_param_local_data *)nn_param->local)
 /*
  * Per-node bookkeeping allocated in op_compute and freed in op_deinit.
  * Each member is a heap array of nn_param->group tensor pointers filled by
  * vsi_nn_CreateTensorGroup().
  */
 typedef struct _vsi_nn_grouped_conv3d_param_local_data {
    vsi_nn_tensor_t ** input_tensor_group;  /* group tensors created from inputs[0] */
    vsi_nn_tensor_t ** weight_tensor_group; /* group tensors created from inputs[1] */
    vsi_nn_tensor_t ** bias_tensor_group;   /* group tensors created from inputs[2]; entries stay NULL when inputs[2] is NULL */
    vsi_nn_tensor_t ** output_tensor_group; /* group tensors created from outputs[0] */
 } vsi_nn_grouped_conv3d_param_local_data;
 /*
  * Build the vx nodes for a grouped 3D convolution.
  *
  * The input, weight, optional bias and output tensors are each split into
  * nn_param->group tensors with vsi_nn_CreateTensorGroup(), and one
  * vxConv3dLayer node is added to the graph per group.  The per-group tensors
  * are stored in nn_param->local and are released by op_deinit(), including
  * when this function returns early with a failure.
  *
  * self    - node being computed; its grouped_conv3d parameters are read.
  * inputs  - [0] data, [1] weights, [2] bias (may be NULL).
  * outputs - [0] result tensor.
  *
  * Returns VSI_SUCCESS when every group node was added, VSI_FAILURE on any
  * allocation, tensor-group or node creation error.  When
  * VX_CONV_3D_API_SUPPORT is not enabled nothing is built and VSI_SUCCESS is
  * returned.
  */
 static vsi_status op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
 #if VX_CONV_3D_API_SUPPORT
 #define _TENSOR_LEN 64
    vsi_bool res;
    uint32_t i;
    char tensor_name[_TENSOR_LEN];
    vsi_nn_grouped_conv3d_param *nn_param = &self->nn_param.grouped_conv3d;

    /* Container for the four per-group tensor arrays. */
    nn_param->local = (vsi_nn_grouped_conv3d_param_local_data*)malloc(
        sizeof(vsi_nn_grouped_conv3d_param_local_data));
    if (NULL == nn_param->local)
    {
        VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }
    memset(nn_param->local, 0, sizeof(vsi_nn_grouped_conv3d_param_local_data));

    /* Input tensors, one per group. */
    LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc(
        nn_param->group * sizeof(vsi_nn_tensor_t *));
    if (NULL == LOCAL()->input_tensor_group)
    {
        VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }
    memset(LOCAL()->input_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
    res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 3,
        LOCAL()->input_tensor_group, nn_param->group);
    if (res == FALSE)
    {
        VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }

    /* Weight tensors, one per group. */
    LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc(
        nn_param->group * sizeof(vsi_nn_tensor_t *));
    if (NULL == LOCAL()->weight_tensor_group)
    {
        VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }
    memset(LOCAL()->weight_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
    res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 4,
        LOCAL()->weight_tensor_group, nn_param->group);
    if (res == FALSE)
    {
        VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }

    /* Bias tensors: the array is always allocated, but it is only populated
     * when a bias input exists; otherwise its entries stay NULL. */
    LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc(
        nn_param->group * sizeof(vsi_nn_tensor_t *));
    if (NULL == LOCAL()->bias_tensor_group)
    {
        VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }
    memset(LOCAL()->bias_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
    if (inputs[2] != NULL)
    {
        res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0,
            LOCAL()->bias_tensor_group, nn_param->group);
        if (res == FALSE)
        {
            VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
            return VSI_FAILURE;
        }
    }

    /* Output tensors, one per group. */
    LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc(
        nn_param->group * sizeof(vsi_nn_tensor_t *));
    if (NULL == LOCAL()->output_tensor_group)
    {
        VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }
    memset(LOCAL()->output_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *));
    res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 3,
        LOCAL()->output_tensor_group, nn_param->group);
    if (res == FALSE)
    {
        VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
        return VSI_FAILURE;
    }

    /* Add one conv3d node per group. */
    for (i = 0; i < nn_param->group; i++)
    {
        vx_tensor bias;
        vx_nn_convolution_3d_params_t *param = NULL;
        vx_nn_convolution_3d_params_t param_;
        memset( &param_, 0, sizeof( vx_nn_convolution_3d_params_t ) );
        param = &param_;
        param->padding_w_left = self->nn_param.grouped_conv3d.pad[0];
        param->padding_w_right = self->nn_param.grouped_conv3d.pad[1];
        param->padding_h_top = self->nn_param.grouped_conv3d.pad[2];
        param->padding_h_bottom = self->nn_param.grouped_conv3d.pad[3];
        param->padding_d_front = self->nn_param.grouped_conv3d.pad[4];
        param->padding_d_rear = self->nn_param.grouped_conv3d.pad[5];
        param->stride_w = self->nn_param.grouped_conv3d.stride[0];
        param->stride_h = self->nn_param.grouped_conv3d.stride[1];
        param->stride_d = self->nn_param.grouped_conv3d.stride[2];
        /* Reject any configuration whose dilation product exceeds 1. */
        if (self->nn_param.grouped_conv3d.dilation[0] *
            self->nn_param.grouped_conv3d.dilation[1] *
            self->nn_param.grouped_conv3d.dilation[2] > 1)
        {
            VSILOGE("conv3d could not support dilation > 1\n");
            return VSI_FAILURE;
        }
        /* The vx parameter stores (dilation - 1); a dilation of 0 leaves the
         * zero-initialised field untouched. */
        if ( self->nn_param.grouped_conv3d.dilation[0] > 0 )
        {
            param->dilation_w = self->nn_param.grouped_conv3d.dilation[0] - 1;
        }
        if ( self->nn_param.grouped_conv3d.dilation[1] > 0 )
        {
            param->dilation_h = self->nn_param.grouped_conv3d.dilation[1] - 1;
        }
        if ( self->nn_param.grouped_conv3d.dilation[2] > 0 )
        {
            param->dilation_d = self->nn_param.grouped_conv3d.dilation[2] - 1;
        }
        param->pad_mode = vsi_nn_get_vx_pad_mode(nn_param->pad_mode);
        param->depth_multiplier = self->nn_param.grouped_conv3d.multiplier;
        param->overflow_policy = self->vx_param.overflow_policy;
        param->rounding_policy = self->vx_param.rounding_policy;
        param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding;
        if ( inputs[2] == NULL )
        {
            bias = NULL;
        }
        else
        {
            bias = LOCAL()->bias_tensor_group[i]->t;
        }
        self->n = vxConv3dLayer(
            self->graph->g,
            LOCAL()->input_tensor_group[i]->t,
            LOCAL()->weight_tensor_group[i]->t,
            bias,
            (vx_nn_convolution_3d_params_t* )param,
            sizeof( vx_nn_convolution_3d_params_t),
            LOCAL()->output_tensor_group[i]->t
            );
        /* Give every per-group output a unique, uid-derived reference name. */
        memset(tensor_name, 0, sizeof(tensor_name));
        snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, i);
        if (vxSetReferenceName((vx_reference)LOCAL()->output_tensor_group[i]->t, tensor_name) == VSI_FAILURE)
        {
            VSILOGW("Set uid %u copy node output name fail", self->uid);
            return VSI_FAILURE;
        }
        if ( NULL == self->n )
        {
            VSILOGE("Add vxConv3dLayer fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__);
            return VSI_FAILURE;
        }
        else
        {
            /* no need to maintain self->n */
            vxReleaseNode( &self->n );
            self->n = NULL;
        }
    }
 #else
    VSI_UNREFERENCED(self);
    VSI_UNREFERENCED(inputs);
    VSI_UNREFERENCED(outputs);
 #endif
    return VSI_SUCCESS;
 } /* op_compute() */
 /*
  * Validate the node's inputs and outputs.
  *
  * The check is delegated to the CONV3D op via vsi_nn_OpCheck(); its result is
  * returned unchanged.
  */
 static vsi_bool op_check
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    return vsi_nn_OpCheck(VSI_NN_OP_CONV3D, self, inputs, outputs);
 } /* op_check() */
 /*
  * Prepare the node before compute: settle the overflow policy, bring the
  * weights to NCHW when they are tagged NHWC, resolve the padding values and,
  * when the output shape is still VSI_NN_DIM_AUTO, derive it from the input
  * and weight shapes.
  *
  * Always returns TRUE.
  */
 static vsi_bool op_setup
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    vsi_nn_grouped_conv3d_param *conv = &self->nn_param.grouped_conv3d;
    vsi_size_t weight_perm[] = { 3, 2, 0, 1 };
    vsi_size_t padding[_cnt_of_array(conv->pad)] = {0};
    vsi_size_t k = 0;
    uint32_t axis = 0;

 #ifdef VX_CONVERT_POLICY_WRAP_ENABLE
    if ( -1 == vsi_nn_compareVersion(self->graph, 1, 1, 21) )
    {
        self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
    }
 #endif

    /* Weights tagged NHWC (and not VDATA) are transposed and re-tagged NCHW. */
    if ( inputs[1]->attr.dtype.fmt == VSI_NN_DIM_FMT_NHWC &&
        inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_VDATA )
    {
        vsi_nn_TransposeTensor( self->graph, inputs[1], weight_perm, 4, NULL );
        inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW;
    }

    /* vsi_nn_compute_padding_3d works on vsi_size_t values, so the stored
     * padding is widened, updated, then narrowed back. */
    for (k = 0; k < _cnt_of_array(conv->pad); k++)
    {
        padding[k] = conv->pad[k];
    }
    vsi_nn_compute_padding_3d(
        inputs[0]->attr.size,
        inputs[1]->attr.size,
        conv->stride,
        conv->dilation,
        conv->pad_type,
        padding
    );
    for (k = 0; k < _cnt_of_array(conv->pad); k++)
    {
        conv->pad[k] = (uint32_t)padding[k];
    }

    /* An explicitly shaped output is left as it is. */
    if ( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num )
    {
        return TRUE;
    }

    /* Dimensions 0..2: filter-size formula; pad[2 * axis] is the first of the
     * two padding entries that belong to the dimension. */
    for (axis = 0; axis < 3; axis++)
    {
        outputs[0]->attr.size[axis] = vsi_nn_ComputeFilterSize
            (
            inputs[0]->attr.size[axis],
            inputs[1]->attr.size[axis],
            &conv->pad[2 * axis],
            conv->stride[axis],
            conv->dilation[axis],
            VSI_NN_ROUND_FLOOR
            );
    }

    /* Dimension 3: explicit `weights` count first, then input size times
     * `multiplier`, otherwise dimension 4 of the weight tensor. */
    if (conv->weights > 0)
    {
        outputs[0]->attr.size[3] = conv->weights;
    }
    else if (conv->multiplier > 0)
    {
        outputs[0]->attr.size[3] = inputs[0]->attr.size[3] * conv->multiplier;
    }
    else
    {
        outputs[0]->attr.size[3] = inputs[1]->attr.size[4];
    }

    /* Dimension 4 and the rank are taken over from the input. */
    outputs[0]->attr.size[4] = inputs[0]->attr.size[4];
    outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;

    return TRUE;
 } /* op_setup() */
 static vsi_status op_deinit
    (
    vsi_nn_node_t* self
    )
 {
    vsi_nn_grouped_conv3d_param *nn_param = &(self->nn_param.grouped_conv3d);
    uint32_t i;
    if (LOCAL())
    {
        if (LOCAL()->input_tensor_group)
        {
            for (i = 0; i < nn_param->group; i++)
            {
                vsi_nn_ReleaseTensor(&(LOCAL()->input_tensor_group[i]));
            }
            free(LOCAL()->input_tensor_group);
        }
        if (LOCAL()->weight_tensor_group)
        {
            for (i = 0; i < nn_param->group; i++)
            {
                vsi_nn_ReleaseTensor(&(LOCAL()->weight_tensor_group[i]));
            }
            free(LOCAL()->weight_tensor_group);
        }
        if (LOCAL()->bias_tensor_group != NULL)
        {
            for (i = 0; i < nn_param->group; i++)
            {
                vsi_nn_ReleaseTensor(&(LOCAL()->bias_tensor_group[i]));
            }
            free(LOCAL()->bias_tensor_group);
        }
        if (LOCAL()->output_tensor_group != NULL)
        {
            for (i = 0; i < nn_param->group; i++)
            {
                vsi_nn_ReleaseTensor(&(LOCAL()->output_tensor_group[i]));
            }
            free(LOCAL()->output_tensor_group);
        }
        free(LOCAL());
    }
    vsi_nn_op_common_deinit(self);
    return VSI_SUCCESS;
 } /* op_deinit() */
 __BEGIN_DECLS
 /* Registrar: binds the GROUPED_CONV3D op to the callbacks defined in this
  * file.  No init or optimize callback is supplied (NULL). */
 DEF_OP_REG
    (
    /* op_name    */ GROUPED_CONV3D,
    /* init       */ NULL,
    /* compute    */ op_compute,
    /* deinit     */ op_deinit,
    /* check      */ op_check,
    /* setup      */ op_setup,
    /* optimize   */ NULL,
    /* input_num  */ _INPUT_NUM,
    /* output_num */ _OUTPUT_NUM
    );
 __END_DECLS
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c
@ -0,0 +1,206 @@
 /****************************************************************************
 *
 *    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
 *    to deal in the Software without restriction, including without limitation
 *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *    and/or sell copies of the Software, and to permit persons to whom the
 *    Software is furnished to do so, subject to the following conditions:
 *
 *    The above copyright notice and this permission notice shall be included in
 *    all copies or substantial portions of the Software.
 *
 *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *    DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
 #include <string.h>
 #include <stdlib.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_ops.h"
 #include "vsi_nn_tensor.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "utils/vsi_nn_constraint_check.h"
 #include "vsi_nn_tensor_util_prv.h"
 /* Local data type for the L1_LAYER_NORM op.  It only holds a placeholder
  * member and is not referenced by any function in this file. */
 typedef struct _l1_layer_norm_local_data_t {
    int32_t placeholder;
 } l1_layer_norm_local_data_t;
 /*
 Declare number of input and output.
 */
 /* Tensor counts passed to vsi_nn_kernel_selector() in op_compute and to
  * DEF_OP_REG at the end of this file. */
 #define _INPUT_NUM          (4)
 #define _OUTPUT_NUM         (1)
 /*
  * Create the "l1_layer_norm" kernel node for this op.
  *
  * The node's eps and axis parameters are packed into a kernel parameter set
  * and handed to vsi_nn_kernel_selector() together with the input and output
  * tensors.  The parameter set is released before returning.
  *
  * Returns VSI_SUCCESS when a kernel node was obtained (it is stored in
  * self->n), VSI_FAILURE otherwise.
  */
 static vsi_status op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    const float epsilon = self->nn_param.l1_layer_norm.eps;
    const int32_t norm_axis = self->nn_param.l1_layer_norm.axis;
    vsi_nn_kernel_param_t * kernel_args = vsi_nn_kernel_param_create();
    vsi_nn_kernel_node_t    node = NULL;
    vsi_status result = VSI_FAILURE;

    vsi_nn_kernel_param_add_float32( kernel_args, "eps", epsilon );
    vsi_nn_kernel_param_add_int32( kernel_args, "axis", norm_axis );

    node = vsi_nn_kernel_selector( self->graph, "l1_layer_norm",
                    inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, kernel_args );
    if ( NULL != node )
    {
        self->n = (vx_node)node;
        result = VSI_SUCCESS;
    }

    if ( NULL != kernel_args )
    {
        vsi_nn_kernel_param_release( &kernel_args );
    }

    return result;
 } /* op_compute() */
 /*
  * Check whether the data types of the node's tensors are supported.
  *
  * If vsi_nn_is_stream_process_supported_types() accepts the inputs, the op is
  * accepted without consulting the table.  Otherwise the input/output types
  * are validated against the L1_LAYER_NORM table below; on a mismatch the
  * offending type combination is logged and FALSE is returned.
  *
  * Returns TRUE when the types are accepted, FALSE otherwise.
  */
 static vsi_bool op_check
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num);
    if (!ret)
    {
        /* Table declared for 4 inputs and 1 output; each row presumably lists
         * the four input types followed by the output type. */
        BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
            IO_TYPE(D_F32,        D_F32,  D_F32,  D_F32,  D_F32)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_F16)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_U8|Q_ASYM)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_U8|Q_ASYM)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_I8|Q_DFP)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_I8|Q_DFP)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_I8|Q_ASYM)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_I8|Q_ASYM)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_I8|Q_SYM)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_I8|Q_SYM)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_I16|Q_DFP)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_I16|Q_DFP)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_I16|Q_ASYM)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_I16|Q_ASYM)
            IO_TYPE(D_F16,        D_F32,  D_F16,  D_F32,  D_I16|Q_SYM)
            IO_TYPE(D_F16,        D_F32,  D_F32,  D_F32,  D_I16|Q_SYM)
            IO_TYPE(D_BF16,       D_F32,  D_F32,  D_F32,  D_BF16)
            IO_TYPE(D_U8|Q_ASYM,  D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_U8|Q_ASYM,  D_F32,  D_F16,  D_F32,  D_U8|Q_ASYM)
            IO_TYPE(D_I16|Q_DFP,  D_F32,  D_F16,  D_F32,  D_I16|Q_DFP)
            IO_TYPE(D_I16|Q_ASYM, D_F32,  D_F16,  D_F32,  D_I16|Q_ASYM)
            IO_TYPE(D_I16|Q_SYM,  D_F32,  D_F16,  D_F32,  D_I16|Q_SYM)
            IO_TYPE(D_I16|Q_DFP,  D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_I16|Q_ASYM, D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_I16|Q_SYM,  D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_I8|Q_DFP,   D_F32,  D_F16,  D_F32,  D_I8|Q_DFP)
            IO_TYPE(D_I8|Q_ASYM,  D_F32,  D_F16,  D_F32,  D_I8|Q_ASYM)
            IO_TYPE(D_I8|Q_SYM,   D_F32,  D_F16,  D_F32,  D_I8|Q_SYM)
            IO_TYPE(D_I8|Q_DFP,   D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_I8|Q_ASYM,  D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_I8|Q_SYM,   D_F32,  D_F16,  D_F32,  D_F16)
            IO_TYPE(D_U8|Q_ASYM,  D_F32,  D_F32,  D_F32,  D_U8|Q_ASYM)
            IO_TYPE(D_U8|Q_ASYM,  D_F32,  D_F32,  D_F32,  D_F16)
            IO_TYPE(D_I16|Q_DFP,  D_F32,  D_F32,  D_F32,  D_I16|Q_DFP)
            IO_TYPE(D_I16|Q_ASYM, D_F32,  D_F32,  D_F32,  D_I16|Q_ASYM)
            IO_TYPE(D_I16|Q_SYM,  D_F32,  D_F32,  D_F32,  D_I16|Q_SYM)
            IO_TYPE(D_I16|Q_DFP,  D_F32,  D_F32,  D_F32,  D_F16)
            IO_TYPE(D_I16|Q_ASYM, D_F32,  D_F32,  D_F32,  D_F16)
            IO_TYPE(D_I16|Q_SYM,  D_F32,  D_F32,  D_F32,  D_F16)
            IO_TYPE(D_I8|Q_DFP,   D_F32,  D_F32,  D_F32,  D_I8|Q_DFP)
            IO_TYPE(D_I8|Q_ASYM,  D_F32,  D_F32,  D_F32,  D_I8|Q_ASYM)
            IO_TYPE(D_I8|Q_SYM,   D_F32,  D_F32,  D_F32,  D_I8|Q_SYM)
            IO_TYPE(D_I8|Q_DFP,   D_F32,  D_F32,  D_F32,  D_F16)
            IO_TYPE(D_I8|Q_ASYM,  D_F32,  D_F32,  D_F32,  D_F16)
            IO_TYPE(D_I8|Q_SYM,   D_F32,  D_F32,  D_F32,  D_F16)
        END_IO_TYPE_DECL(L1_LAYER_NORM)
        if (!VALIDATE_OP_IO_TYPES(L1_LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num))
        {
            /* Build a description of the actual types for the error log,
             * then destroy it again. */
            char* desc = generate_op_io_types_desc(inputs,
                    self->input.num, outputs, self->output.num);
            VSILOGE("Inputs/Outputs data type not support: %s", desc);
            destroy_op_io_types_desc(desc);
            return FALSE;
        }
    }
    return TRUE;
 } /* op_check() */
 /*
  * Infer the output shape when it was left as VSI_NN_DIM_AUTO: the output
  * takes the rank and every dimension size of inputs[0].  An explicitly
  * shaped output is not modified.
  *
  * Always returns TRUE.
  */
 static vsi_bool op_setup
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
 {
    int32_t dim = 0;
    int32_t rank = 0;

    VSI_UNREFERENCED(self);

    if (VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num)
    {
        return TRUE;
    }

    outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
    rank = (int32_t)inputs[0]->attr.dim_num;
    while (dim < rank)
    {
        outputs[0]->attr.size[dim] = inputs[0]->attr.size[dim];
        dim++;
    }

    return TRUE;
 } /* op_setup() */
 /*
  * Set the node's default parameters: the normalization axis starts at 0.
  *
  * Always returns VSI_SUCCESS.
  */
 static vsi_status op_init
    (
    vsi_nn_node_t* self
    )
 {
    self->nn_param.l1_layer_norm.axis = 0;

    return VSI_SUCCESS;
 } /* op_init() */
 __BEGIN_DECLS
 /* Registrar: binds the L1_LAYER_NORM op to the callbacks defined in this
  * file.  No deinit or optimize callback is supplied (NULL). */
 DEF_OP_REG
    (
    /* op_name    */ L1_LAYER_NORM,
    /* init       */ op_init,
    /* compute    */ op_compute,
    /* deinit     */ NULL,
    /* check      */ op_check,
    /* setup      */ op_setup,
    /* optimize   */ NULL,
    /* input_num  */ _INPUT_NUM,
    /* output_num */ _OUTPUT_NUM
    );
 __END_DECLS
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
@ -25,6 +25,7 @@
 #include <stdlib.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_graph.h"
@ -161,7 +162,7 @@ static vsi_bool op_setup
            if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ||
                p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP)
            {
-                enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc;
+                enable_rgb88_planar_nhwc = ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_rgb88_planar_nhwc;
            }
        }
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
@ -183,7 +183,8 @@ static vsi_bool _check_is_sp_supported_type
        return FALSE;
    }
-    if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) ||
+    if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2 ||
         (axes[0] == 1 && (input->attr.size[0] == 1 || input->attr.size[2] == 1)))) ||
         (axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) )
    {
        return TRUE;
@ -1167,6 +1168,7 @@ static vsi_bool op_setup
            {
                outputs[0]->attr.dim_num = 1;
                outputs[0]->attr.size[0] = 1;
                vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
            }
            else
            {
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c
@ -93,52 +93,32 @@ static vsi_bool op_check
    if (!ret)
    {
        BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1)
-            IO_TYPE(D_F32, D_F32, D_F32)
+            IO_TYPE(D_F32,          D_F32,  D_F32)
-            IO_TYPE(D_F16, D_F32, D_F16)
+            IO_TYPE(D_F32,          D_F32,  D_F16)
-            IO_TYPE(D_F16, D_F32, D_F16)
+            IO_TYPE(D_F16,          D_F32,  D_F16)
-            IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
+            IO_TYPE(D_F16,          D_F32,  D_F32)
-            IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
+            IO_TYPE(D_F16,          D_F32,  D_U8 | Q_ASYM)
-            IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
+            IO_TYPE(D_F16,          D_F32,  D_I8 | Q_DFP)
-            IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
+            IO_TYPE(D_F16,          D_F32,  D_I8 | Q_ASYM)
-            IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
+            IO_TYPE(D_F16,          D_F32,  D_I8 | Q_SYM)
-            IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
+            IO_TYPE(D_F16,          D_F32,  D_I16 | Q_DFP)
-            IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
+            IO_TYPE(D_F16,          D_F32,  D_I16 | Q_ASYM)
-            IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
+            IO_TYPE(D_F16,          D_F32,  D_I16 | Q_SYM)
-            IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
+            IO_TYPE(D_BF16,         D_F32,  D_BF16)
-            IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
+            IO_TYPE(D_U8 | Q_ASYM,  D_F32,  D_F16)
-            IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
+            IO_TYPE(D_U8 | Q_ASYM,  D_F32,  D_U8 | Q_ASYM)
-            IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
+            IO_TYPE(D_I16 | Q_DFP,  D_F32,  D_I16 | Q_DFP)
-            IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
+            IO_TYPE(D_I16 | Q_ASYM, D_F32,  D_I16 | Q_ASYM)
-            IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
+            IO_TYPE(D_I16 | Q_SYM,  D_F32,  D_I16 | Q_SYM)
-            IO_TYPE(D_BF16, D_F32, D_BF16)
+            IO_TYPE(D_I16 | Q_DFP,  D_F32,  D_F16)
-            IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
+            IO_TYPE(D_I16 | Q_ASYM, D_F32,  D_F16)
-            IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
+            IO_TYPE(D_I16 | Q_SYM,  D_F32,  D_F16)
-            IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
+            IO_TYPE(D_I8 | Q_DFP,   D_F32,  D_I8 | Q_DFP)
-            IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
+            IO_TYPE(D_I8 | Q_ASYM,  D_F32,  D_I8 | Q_ASYM)
-            IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
+            IO_TYPE(D_I8 | Q_SYM,   D_F32,  D_I8 | Q_SYM)
-            IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
+            IO_TYPE(D_I8 | Q_DFP,   D_F32,  D_F16)
-            IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
+            IO_TYPE(D_I8 | Q_ASYM,  D_F32,  D_F16)
-            IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
+            IO_TYPE(D_I8 | Q_SYM,   D_F32,  D_F16)
            IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
            IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
            IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
            IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
            IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
            IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
            IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
            IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
            IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
            IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
            IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
            IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
            IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
            IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
            IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
            IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
            IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
            IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
            IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
            IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
            END_IO_TYPE_DECL(RMS_NORM)
            if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num))
            {
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
@ -25,6 +25,7 @@
 #include <stdlib.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_log.h"
@ -776,7 +777,7 @@ static vsi_status op_optimize
    /* Only forward run stride_slice's optimize */
    if ( direction == VSI_NN_OPTIMIZE_BACKWARD ||
-         !self->graph->ctx->options.enable_slice_optimize )
+         !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_slice_optimize )
    {
        return status;
    }
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c
@ -78,9 +78,10 @@ static vsi_status _tile_op_compute
    vsi_size_t new_rank                      = 0;
    vsi_bool   ret                          = FALSE;
    uint32_t i                              = 0;
-    vsi_size_t* multiples                   = (vsi_size_t*)self->nn_param.tile.multiples;
+    int32_t* multiples_                     = (int32_t*)self->nn_param.tile.multiples;
    vsi_nn_tensor_t* temp_tensors[3]        = { NULL };
    vsi_nn_tensor_t* reshape_tensors[3]     = { NULL };
    vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = {1};
    int32_t   multiples_value[VSI_NN_MAX_DIM_NUM] = {0};
    vsi_nn_tensor_attr_t attr;
@ -101,6 +102,11 @@ static vsi_status _tile_op_compute
        temp_tensors[2] = outputs[0];
    }
    for (i = 0; i < inputs[0]->attr.dim_num; i ++)
    {
        multiples[i] = (vsi_size_t)multiples_[i];
    }
    ret = vsi_nn_kernel_optimize_tile_shape(
            inputs[0]->attr.size, inputs[0]->attr.dim_num,
            multiples, inputs[0]->attr.dim_num,
@ -111,6 +117,7 @@ static vsi_status _tile_op_compute
    {
        if (_is_supported_axis(shapes[1], new_rank) == FALSE)
        {
            uint32_t _multiples = (uint32_t)(new_rank > 4 && shapes[1][4] > 1 ? 3 : 2);
            reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\
                shapes[0], (vsi_size_t)new_rank );
            reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, temp_tensors[2],\
@ -125,8 +132,11 @@ static vsi_status _tile_op_compute
            memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr));
            attr.is_const = FALSE;
            attr.vtl = TRUE;
-            attr.size[0] = reshape_tensors[2]->attr.size[0];
+
-            attr.size[1] = reshape_tensors[2]->attr.size[1];
+            for (i = 0; i < _multiples; i++)
            {
                attr.size[i] = reshape_tensors[2]->attr.size[i];
            }
            temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr );
            memset( &attr, 0 , sizeof(vsi_nn_tensor_attr_t) );
@ -136,9 +146,11 @@ static vsi_status _tile_op_compute
            attr.size[0] = new_rank;
            attr.dim_num = 1;
-            multiples_value[0] = (int32_t)shapes[1][0];
+            for (i = 0; i < _multiples; i++)
-            multiples_value[1] = (int32_t)shapes[1][1];
+            {
-            for (i = 0; i < new_rank; i++)
+                multiples_value[i] = (int32_t)shapes[1][i];
            }
            for (i = _multiples; i < new_rank; i++)
            {
                multiples_value[i] = 1;
            }
@ -150,9 +162,11 @@ static vsi_status _tile_op_compute
                goto final;
            }
-            multiples_value[0] = 1;
+            for (i = 0; i < _multiples; i++)
-            multiples_value[1] = 1;
+            {
-            for (i = 0; i < new_rank; i++)
+                multiples_value[i] = 1;
            }
            for (i = _multiples; i < new_rank; i++)
            {
                multiples_value[i] = (int32_t)shapes[1][i];
            }
@ -257,6 +271,7 @@ static vsi_bool op_check
        IO_TYPE(D_F32,          D_F32)
        IO_TYPE(D_F32,          D_U8|Q_ASYM)
        IO_TYPE(D_F16,          D_U8|Q_ASYM)
        IO_TYPE(D_BOOL8,        D_BOOL8)
    END_IO_TYPE_DECL(TILE)
    if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) {
        char* desc = generate_op_io_types_desc(inputs,
--- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
@ -471,6 +471,10 @@ static _op_param_gen_t s_op_gen[] =
    /* TAN */                   NULL,
    /* RMSNORM */               NULL,
    /* SHAPE */                 NULL,
    /* BITCAST */               NULL,
    /* GROUPED_CONV3D */        NULL,
    /* COL2IM */                NULL,
    /* L1_LAYER_NORM */         NULL,
 };
 _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );
--- a/src/tim/vx/internal/src/utils/vsi_nn_util.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c
@ -772,6 +772,7 @@ vsi_bool vsi_nn_CreateTensorGroup
    end[1] = in_tensor->attr.size[1];
    end[2] = in_tensor->attr.size[2];
    end[3] = in_tensor->attr.size[3];
    end[4] = in_tensor->attr.size[4];
    end[axis] = 0;
    for( i = 0; i <  group_number; i ++ )
    {
@ -1259,6 +1260,32 @@ vsi_bool vsi_nn_is_same_quant_type(
            }
            break;
        }
 #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: {
            const float diff = (float)1e-5;
            int32_t i = 0;
            int32_t scale_cnt0 = src_dtype->group_count;
            int32_t scale_cnt1 = dst_dtype->group_count;
            int32_t group_size0 = src_dtype->group_size;
            int32_t group_size1 = dst_dtype->group_size;
            if (scale_cnt0 == scale_cnt1 && group_size0 == group_size1)
            {
                const float* src_scale_ptr = src_dtype->group_scales;
                const float* dst_scale_ptr = dst_dtype->group_scales;
                for (i = 0; i < scale_cnt0; i++)
                {
                    if (vsi_nn_float_compare(
                            src_scale_ptr[i], dst_scale_ptr[i], diff) == FALSE)
                    {
                        return FALSE;
                    }
                }
            } else {
                return FALSE;
            }
            break;
        }
 #endif
        default:
            break;
    }
--- a/src/tim/vx/internal/src/vsi_nn_context.c
+++ b/src/tim/vx/internal/src/vsi_nn_context.c
@ -22,10 +22,10 @@
 *
 *****************************************************************************/
 #include <stdlib.h>
 #include "vsi_nn_types.h"
 #include "vsi_nn_test.h"
 #include "vsi_nn_context.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_types.h"
 static vsi_status query_hardware_caps
    (
@ -103,6 +103,9 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "vendor.VSI_VX_ENABLE_STREAM_PR
 static const char* ENV_FORCE_RGB888_OUT_NHWC = "vendor.VSI_NN_FORCE_RGB888_OUT_NHWC";
 static const char* ENV_ENABLE_SLICE_OPTIMIZE = "vendor.VSI_NN_ENABLE_SLICE_OPTIMIZE";
 static const char* ENV_ENABLE_BATCH_OPT = "vendor.VSI_VX_ENABLE_BATCH_OPT";
 static const char* ENV_SAVE_FILE_TYPE = "vendor.VSI_SAVE_FILE_TYPE";
 static const char* VSI_USE_IMAGE_PROCESS = "vendor.VSI_USE_IMAGE_PROCESS";
 static const char* VSI_USE_FROM_HANDLE = "vendor.VSI_USE_FROM_HANDLE";
 #else
 static const char* ENV_ENABLE_SHADER = "VIV_VX_ENABLE_SHADER";
 static const char* ENV_ENABLE_OPCHECK = "VSI_NN_ENABLE_OPCHECK";
@ -113,8 +116,11 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "VSI_VX_ENABLE_STREAM_PROCESSOR
 static const char* ENV_FORCE_RGB888_OUT_NHWC = "VSI_NN_FORCE_RGB888_OUT_NHWC";
 static const char* ENV_ENABLE_SLICE_OPTIMIZE = "VSI_NN_ENABLE_SLICE_OPTIMIZE";
 static const char* ENV_ENABLE_BATCH_OPT = "VSI_VX_ENABLE_BATCH_OPT";
 static const char* ENV_SAVE_FILE_TYPE = "VSI_SAVE_FILE_TYPE";
 static const char* VSI_USE_IMAGE_PROCESS = "VSI_USE_IMAGE_PROCESS";
 static const char* VSI_USE_FROM_HANDLE = "VSI_USE_FROM_HANDLE";
 #endif
-static vsi_status vsi_nn_initOptions
+vsi_status vsi_nn_initOptions
    (
    vsi_nn_runtime_option_t *options
    )
@ -129,7 +135,7 @@ static vsi_status vsi_nn_initOptions
    default_value = 1;
 #endif
    options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value);
-    options->enable_asymi8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
+    options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
    options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1);
    options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1);
    options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0);
@ -140,6 +146,9 @@ static vsi_status vsi_nn_initOptions
 #endif
    options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value);
    options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0);
    options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0);
    options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1);
    options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1);
    return VSI_SUCCESS;
 }
--- a/src/tim/vx/internal/src/vsi_nn_graph.c
+++ b/src/tim/vx/internal/src/vsi_nn_graph.c
@ -1354,20 +1354,26 @@ vsi_nn_graph_t * vsi_nn_CreateGraph
            graph->node_num = 0;
            graph->ctx = ctx;
            graph->rnn_wksp = NULL;
            ((vsi_nn_graph_prv_t*) graph)->options =
                (vsi_nn_runtime_option_t *)malloc( sizeof( vsi_nn_runtime_option_t ));
            CHECK_PTR_FAIL_GOTO(((vsi_nn_graph_prv_t*) graph)->options, "Create graph options fail.", error);
            graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) );
            graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) );
            graph->isAllowFastMode = TRUE;
            vsi_nn_MapInit( graph->node_table );
            vsi_nn_MapInit( graph->tensor_table );
            vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options );
        }
        else
        {
            VSILOGE( "Create vx graph fail." );
-            free( graph );
+            free(graph);
            graph = NULL;
        }
    }
    return graph;
 error:
    return graph;
 } /* vsi_nn_CreateGraph() */
@ -1429,6 +1435,10 @@ void vsi_nn_ReleaseGraph
                free( tmp );
            }
        }
        if (NULL != ((vsi_nn_graph_prv_t*)ptr)->options)
        {
            free(((vsi_nn_graph_prv_t*)ptr)->options);
        }
        free( ptr );
        *graph = NULL;
    }
@ -1500,7 +1510,7 @@ vsi_status vsi_nn_SetupGraph
    }
 #if VX_GRAPH_BATCH_OPT_SUPPORT
-    if (graph->ctx->options.enable_batch_opt)
+    if (((vsi_nn_graph_prv_t*)graph)->options->enable_batch_opt)
    {
        /*processing batch splitting*/
        status = batchInference_graph(graph, nodes_list);
@ -2064,7 +2074,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
    const char          * kernel_name
    )
 {
-    vsi_nn_node_t * node;
+    vsi_nn_node_prv_t* node;
    vsi_nn_node_id_t id;
    vsi_nn_op_proc_t * node_proc;
@ -2076,16 +2086,17 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
    {
        return NULL;
    }
-    node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) );
+    node = (vsi_nn_node_prv_t*)malloc(sizeof(vsi_nn_node_prv_t));
    if( NULL != node )
    {
-        memset( node, 0, sizeof( vsi_nn_node_t ) );
+        memset(node, 0, sizeof(vsi_nn_node_prv_t));
-        node->graph = graph;
+        node->pon.graph = graph;
-        node->op = op;
+        node->pon.op = op;
-        node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
+        node->pon.vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
-        node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
+        node->pon.vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
-        node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR;
+        node->pon.vx_param.down_scale_size_rounding =
            VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR;
        /* init op */
        if(node_proc->init != NULL){
@ -2093,31 +2104,31 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
        }
        /* init output struct */
-        node->output.num = node_proc->output_num;
+        node->pon.output.num = node_proc->output_num;
-        node->output.tensors = (vsi_nn_tensor_id_t *) malloc(
+        node->pon.output.tensors = (vsi_nn_tensor_id_t*)malloc(
            node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) );
-        if ( NULL == node->output.tensors )
+        if (NULL == node->pon.output.tensors)
        {
            VSILOGE("Create output tensor id %s. fail", vsi_nn_OpGetName(op));
            vsi_nn_safe_free(node);
            return NULL;
        }
-        vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num );
+        vsi_nn_InitTensorsId(node->pon.output.tensors, node_proc->output_num);
        /* init input struct */
-        node->input.num = node_proc->input_num;
+        node->pon.input.num = node_proc->input_num;
-        node->input.tensors = (vsi_nn_tensor_id_t *) malloc(
+        node->pon.input.tensors = (vsi_nn_tensor_id_t*)malloc(
            node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) );
-        if ( NULL == node->input.tensors )
+        if (NULL == node->pon.input.tensors)
        {
            VSILOGE("Create input tensor id %s. fail", vsi_nn_OpGetName(op));
-            vsi_nn_safe_free(node->output.tensors);
+            vsi_nn_safe_free(node->pon.output.tensors);
            vsi_nn_safe_free(node);
            return NULL;
        }
-        vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num );
+        vsi_nn_InitTensorsId(node->pon.input.tensors, node_proc->input_num);
-        node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE;
+        node->pon.attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE;
-        node->attr.enable_op_constraint_check = TRUE;
+        node->pon.attr.enable_op_constraint_check = TRUE;
    }
    id = graph->cur_nid;
    if(NULL != node){
@ -2126,7 +2137,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode
        graph->cur_nid ++;
    }
    vsi_nn_OpRegisterExternalOvxInit(op, kernel_name, node_proc);
-    return node;
+    return (vsi_nn_node_t*)node;
 } /* vsi_nn_AddExternalNode() */
 void vsi_nn_RemoveNode
@ -3354,24 +3365,245 @@ final:
    return status;
 } /* vsi_nn_ExecuteGraphLoop() */
 /*
  * Identifiers for the runtime options that can be queried or changed per graph.
  * vsi_nn_GetVariable() maps an option-name string to one of these values, and
  * vsi_nn_GetRunTimeVariable()/vsi_nn_SetRunTimeVariable() switch on the result.
  * Only the first enumerator has an explicit value (0); the rest follow in order.
  */
 typedef enum {
    VSI_NN_ENABLE_I8TOU8 = 0,
    VSI_NN_ENABLE_OPCHECK,
    VSI_SAVE_FILE_TYPE,
    VSI_USE_IMAGE_PROCESS,
    VSI_NN_LOG_LEVEL, /* NOTE(review): has no entry in vsi_nn_GetVariable()'s table */
    VSI_NN_ENABLE_CONCAT_OPTIMIZE,
    VSI_NN_ENABLE_DATACONVERT_OPTIMIZE,
    VSI_VX_ENABLE_STREAM_PROCESSOR,
    VSI_NN_FORCE_RGB888_OUT_NHWC,
    VSI_NN_ENABLE_SLICE_OPTIMIZE,
    VSI_VX_ENABLE_BATCH_OPT,
    VIV_VX_ENABLE_SHADER,
    VSI_USE_FROM_HANDLE,
    VIV_VX_ENABLE_GRAPH_TRANSFORM /* forwarded to the driver instead of stored in the option block */
 } VSI_PUBLIC_TYPE vsi_nn_runtime_variable;
-vsi_status vsi_nn_SetGraphTransformOption
+typedef struct {
    /* One row of the name -> id lookup table used by vsi_nn_GetVariable(). */
    const char* key;   /* option name; a NULL key terminates the table */
    int32_t value;     /* vsi_nn_runtime_variable enumerator, or -1 in the terminator row */
 } VSI_PUBLIC_TYPE keyValuePair;
 char* vsi_nn_GetRunTimeVariable
    (
    const vsi_nn_graph_t* graph,
    const char* key
    )
 {
    int32_t isVaid = 1;
    int32_t value = -1;
 #define varSize 256
    char* value_str = (char*)malloc(sizeof(char) * varSize);
    CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final);
    memset(value_str, 0, varSize);
    char tmp_value[varSize] = {0};
    VSI_UNREFERENCED(tmp_value);
    vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options;
    switch (vsi_nn_GetVariable(key))
    {
        case VIV_VX_ENABLE_SHADER:
            value =options->enable_shader;
            break;
        case VSI_NN_ENABLE_OPCHECK:
            value = options->enable_opcheck;
            break;
        case VSI_NN_ENABLE_I8TOU8:
            value = options->enable_i8_to_u8;
            break;
        case VSI_VX_ENABLE_STREAM_PROCESSOR:
            value = options->enable_stream_processor;
            break;
        case VSI_VX_ENABLE_BATCH_OPT:
            value = options->enable_batch_opt;
            break;
        case VSI_NN_FORCE_RGB888_OUT_NHWC:
            value = options->enable_rgb88_planar_nhwc;
            break;
        case VSI_SAVE_FILE_TYPE:
            value = options->enable_save_file_type;
            break;
        case VSI_NN_ENABLE_CONCAT_OPTIMIZE:
            value = options->enable_concat_optimize;
            break;
        case VSI_NN_ENABLE_SLICE_OPTIMIZE:
            value = options->enable_slice_optimize;
            break;
        case VSI_USE_IMAGE_PROCESS:
            if (options->enable_use_image_process != -1)
            {
                value = options->enable_use_image_process;
            }
            else
            {
                isVaid = 0;
            }
            break;
        case VSI_USE_FROM_HANDLE:
            if (options->enable_use_from_handle != -1)
            {
                value = options->enable_use_from_handle;
            }
            else
            {
                isVaid = 0;
            }
            break;
        default:
            isVaid = 0;
            VSILOGE("Not support this key: %s.", key);
    }
    if (isVaid == 1)
    {
        snprintf(tmp_value, varSize, "%d", value);
        memcpy(value_str, tmp_value, varSize);
    } else
    {
        goto final;
    }
 #undef varSize
    return value_str;
 final:
 #undef varSize
    vsi_nn_safe_free(value_str);
    return value_str;
 }
 /*
  * Set one runtime option on a graph from a (key, value) string pair.
  *
  * graph : target graph; VSI_FAILURE is returned when it is NULL.
  * key   : option name, resolved through vsi_nn_GetVariable().
  * value : new setting as text; integer options are parsed with atoi().
  *
  * The option is only changed when vsi_nn_getenv(key) returns NULL; otherwise
  * nothing is modified and the initial VSI_SUCCESS status is returned
  * (presumably so an environment variable takes precedence - confirm).
  * NOTE(review): key and value are not checked for NULL before being passed to
  * vsi_nn_getenv(), vsi_nn_GetVariable() and atoi().
  */
 vsi_status vsi_nn_SetRunTimeVariable
    (
    vsi_nn_graph_t* graph,
-    const char* ctrl_str,
+    const char* key,
-    size_t size
+    const char* value
     )
 {
    vsi_status status = VSI_SUCCESS;
    size_t size = 1;  // placeholder, not used in vxSetGraphAttribute.
    if (graph == NULL)
    {
        status = VSI_FAILURE;
        return status;
    }
    vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options;
    VSI_UNREFERENCED(size);
    /* Skip the whole update when vsi_nn_getenv() already finds a value for key. */
    if (vsi_nn_getenv(key) == NULL)
    {
        switch (vsi_nn_GetVariable(key) )
        {
            /* Integer options: stored directly in the graph's option block. */
            case VIV_VX_ENABLE_SHADER:
                options->enable_shader = atoi(value);
                break;
            case VSI_NN_ENABLE_OPCHECK:
                options->enable_opcheck = atoi(value);
                break;
            case VSI_NN_ENABLE_I8TOU8:
                options->enable_i8_to_u8 = atoi(value);
                break;
            case VSI_VX_ENABLE_STREAM_PROCESSOR:
                options->enable_stream_processor = atoi(value);
                break;
            case VSI_VX_ENABLE_BATCH_OPT:
                options->enable_batch_opt = atoi(value);
                break;
            case VSI_NN_FORCE_RGB888_OUT_NHWC:
                options->enable_rgb88_planar_nhwc = atoi(value);
                break;
            case VSI_NN_ENABLE_CONCAT_OPTIMIZE:
                options->enable_concat_optimize = atoi(value);
                break;
            case VSI_NN_ENABLE_DATACONVERT_OPTIMIZE:
                options->enable_dataconvert_optimize = atoi(value);
                break;
            case VSI_NN_ENABLE_SLICE_OPTIMIZE:
                options->enable_slice_optimize = atoi(value);
                break;
            case VSI_SAVE_FILE_TYPE:
                options->enable_save_file_type = atoi(value);
                break;
            case VSI_USE_IMAGE_PROCESS:
                options->enable_use_image_process = atoi(value);
                break;
            case VSI_USE_FROM_HANDLE:
                options->enable_use_from_handle = atoi(value);
                break;
            /* Not stored locally: the raw string is handed to the driver as a graph attribute. */
            case VIV_VX_ENABLE_GRAPH_TRANSFORM:
 #ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT
                if (graph && graph->g) {
                    status = vxSetGraphAttribute(
                        graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, value, size);
                }
 #else
                status = VSI_FAILURE;
                VSILOGE("VX_GRAPH_TRANSFORM_OPTION_SUPPORT is not defined, please check driver version.");
 #endif
                break;
            /* Keys unknown to vsi_nn_GetVariable() are passed through to vxSetGraphEnv() when available. */
            default:
 #ifdef VX_GRAPH_ENV_SUPPORT
                status = vxSetGraphEnv(graph->g, key, value);
 #else
                status = VSI_FAILURE;
                VSILOGE("VX_GRAPH_ENV_SUPPORT is not defined, please check driver version.");
 #endif
                break;
        }
    }
    return status;
 }
 int32_t vsi_nn_GetVariable(const char* variableKey) {
    keyValuePair dict[] = {
        {"VSI_NN_ENABLE_I8TOU8", VSI_NN_ENABLE_I8TOU8},
        {"VSI_NN_ENABLE_OPCHECK", VSI_NN_ENABLE_OPCHECK},
        {"VSI_SAVE_FILE_TYPE", VSI_SAVE_FILE_TYPE},
        {"VSI_USE_IMAGE_PROCESS", VSI_USE_IMAGE_PROCESS},
        {"VSI_NN_ENABLE_CONCAT_OPTIMIZE", VSI_NN_ENABLE_CONCAT_OPTIMIZE},
        {"VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", VSI_NN_ENABLE_DATACONVERT_OPTIMIZE},
        {"VSI_VX_ENABLE_STREAM_PROCESSOR", VSI_VX_ENABLE_STREAM_PROCESSOR},
        {"VSI_NN_FORCE_RGB888_OUT_NHWC", VSI_NN_FORCE_RGB888_OUT_NHWC},
        {"VSI_NN_ENABLE_SLICE_OPTIMIZE", VSI_NN_ENABLE_SLICE_OPTIMIZE},
        {"VSI_VX_ENABLE_BATCH_OPT", VSI_VX_ENABLE_BATCH_OPT},
        {"VIV_VX_ENABLE_SHADER", VIV_VX_ENABLE_SHADER},
        {"VSI_USE_FROM_HANDLE", VSI_USE_FROM_HANDLE},
        {"VIV_VX_ENABLE_GRAPH_TRANSFORM", VIV_VX_ENABLE_GRAPH_TRANSFORM},
        {NULL, -1}
    };
    for (int32_t i = 0; dict[i].key != NULL; i++) {
        if (strcmp(dict[i].key, variableKey) == 0) {
            return dict[i].value;
        }
    }
    return -1;
 }
 /*
  * Ask the driver for a JSON description of the graph.
  * Returns the string from vxGenerateGraphJson(), or NULL when the graph or its
  * vx handle is missing or VX_GENERATE_GRAPH_JSON_API_SUPPORT is not defined.
  */
 OVXLIB_API char* vsi_nn_GenerateGraphJson
    (
    vsi_nn_graph_t* graph
    )
 {
    char* result = NULL;
    VSI_UNREFERENCED(graph);
 #ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT
    result = (graph && graph->g) ? vxGenerateGraphJson(graph->g) : NULL;
 #endif
    return result;
 }
 /*
  * Release a JSON string through vxReleaseGraphJson() when
  * VX_GENERATE_GRAPH_JSON_API_SUPPORT is defined and json is non-NULL;
  * status starts as VSI_FAILURE.
  * NOTE(review): this hunk interleaves removed lines of the former
  * vsi_nn_SetGraphTransformOption() body with the added lines, and the closing
  * brace of the added `if (json) {` is not visible here - verify against the
  * real file.
  */
 OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson
    (
    char* json
    )
 {
    vsi_status status = VSI_FAILURE;
-    VSI_UNREFERENCED(graph);
+    VSI_UNREFERENCED(json);
-    VSI_UNREFERENCED(ctrl_str);
+#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT
-    VSI_UNREFERENCED(size);
+    if (json) {
-#ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT
+        status = vxReleaseGraphJson(json);
    if(graph && graph->g)
    {
        status = vxSetGraphAttribute(graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, ctrl_str, size);
    }
 #endif
    return status;
 }
--- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c
+++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c
@ -26,6 +26,7 @@
 #include "vsi_nn_graph_optimization.h"
 #include "vsi_nn_tensor_util.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_types_prv.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_error.h"
@ -37,14 +38,50 @@ static vsi_bool _is_asymm_int8_norm_tensor
 {
    vsi_bool ret = FALSE;
-    ret = ( tensor != NULL
+    ret = ( tensor != NULL &&
-   && tensor->attr.vtl == FALSE && tensor->attr.is_const == FALSE
+            tensor->attr.vtl == FALSE &&
-   && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
+            tensor->attr.is_const == FALSE &&
-   && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
+            tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
            tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
          );
    return ret;
 }/* _is_asymm_int8_norm_tensor() */
 /*
  * True for a non-NULL tensor that is neither virtual nor constant and whose
  * dtype is INT8 with affine-symmetric quantization.
  */
 static vsi_bool _is_symm_int8_norm_tensor
 (
    vsi_nn_tensor_t* tensor
 )
 {
    if (tensor == NULL)
    {
        return FALSE;
    }
    /* "Norm" tensors are the ones that are neither virtual nor constant. */
    if (tensor->attr.vtl != FALSE || tensor->attr.is_const != FALSE)
    {
        return FALSE;
    }
    return (vsi_bool)(tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
                      tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC);
 }/* _is_symm_int8_norm_tensor() */
 /*
  * True when the tensor is an asymmetric INT8 norm tensor, or a symmetric one
  * while the graph's enable_i8_to_u8 option equals 2.
  */
 static vsi_bool _is_int8_norm_tensor
 (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t* tensor
 )
 {
    /* Option value 2 additionally admits symmetric INT8 tensors. */
    const vsi_bool symm_allowed =
       (((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2);

    return (vsi_bool)(_is_asymm_int8_norm_tensor(tensor) ||
                      (symm_allowed && _is_symm_int8_norm_tensor(tensor)));
 }/* _is_int8_norm_tensor() */
 static vsi_bool _is_asymm_int8_const_tensor
    (
        vsi_nn_tensor_t * tensor
@ -52,14 +89,47 @@ static vsi_bool _is_asymm_int8_const_tensor
 {
    vsi_bool ret = FALSE;
-    ret = ( tensor != NULL
+    ret = ( tensor != NULL &&
-   && tensor->attr.is_const == TRUE
+            tensor->attr.is_const == TRUE &&
-   && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
+            tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
-   && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
+            tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
          );
    return ret;
 }/* _is_asymm_int8_const_tensor() */
 /*
  * True for a non-NULL constant tensor whose dtype is INT8 with
  * affine-symmetric quantization.
  */
 static vsi_bool _is_symm_int8_const_tensor
 (
    vsi_nn_tensor_t* tensor
 )
 {
    /* A missing tensor or a non-constant one never qualifies. */
    if (tensor == NULL || tensor->attr.is_const != TRUE)
    {
        return FALSE;
    }
    return (vsi_bool)(tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
                      tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC);
 }/* _is_symm_int8_const_tensor() */
 /*
  * True when the tensor is an asymmetric INT8 constant tensor, or a symmetric
  * one while the graph's enable_i8_to_u8 option equals 2.
  */
 static vsi_bool _is_int8_const_tensor
 (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t* tensor
 )
 {
    /* Option value 2 additionally admits symmetric INT8 tensors. */
    const vsi_bool symm_allowed =
       (((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2);

    return (vsi_bool)(_is_asymm_int8_const_tensor(tensor) ||
                      (symm_allowed && _is_symm_int8_const_tensor(tensor)));
 }/* _is_int8_const_tensor() */
 static vsi_bool _is_asymm_int8_virtual_tensor
    (
        vsi_nn_tensor_t * tensor
@ -67,14 +137,47 @@ static vsi_bool _is_asymm_int8_virtual_tensor
 {
    vsi_bool ret = FALSE;
-    ret = ( tensor != NULL
+    ret = ( tensor != NULL &&
-   && tensor->attr.vtl == TRUE
+            tensor->attr.vtl == TRUE &&
-   && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8
+            tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
-   && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC);
+            tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
          );
    return ret;
 }/* _is_asymm_int8_virtual_tensor() */
 /*
  * True for a non-NULL virtual tensor whose dtype is INT8 with
  * affine-symmetric quantization.
  */
 static vsi_bool _is_symm_int8_virtual_tensor
 (
    vsi_nn_tensor_t* tensor
 )
 {
    /* A missing tensor or a non-virtual one never qualifies. */
    if (tensor == NULL || tensor->attr.vtl != TRUE)
    {
        return FALSE;
    }
    return (vsi_bool)(tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
                      tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC);
 }/* _is_symm_int8_virtual_tensor() */
 /*
  * True when the tensor is an asymmetric INT8 virtual tensor, or a symmetric
  * one while the graph's enable_i8_to_u8 option equals 2.
  */
 static vsi_bool _is_int8_virtual_tensor
 (
    vsi_nn_graph_t* graph,
    vsi_nn_tensor_t* tensor
 )
 {
    /* Option value 2 additionally admits symmetric INT8 tensors. */
    const vsi_bool symm_allowed =
       (((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2);

    return (vsi_bool)(_is_asymm_int8_virtual_tensor(tensor) ||
                      (symm_allowed && _is_symm_int8_virtual_tensor(tensor)));
 }/* _is_int8_virtual_tensor() */
 static vsi_status _add_forward_node
    (
    vsi_nn_graph_t* graph,
@ -199,7 +302,7 @@ static void _get_graph_input_asymm_int8_norm_tensor
            vsi_nn_tensor_id_t id = node->input.tensors[j];
            vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
-            if (_is_asymm_int8_norm_tensor(tensor))
+            if (_is_int8_norm_tensor(graph, tensor))
            {
                if(tensor_ids != NULL)
                {
@ -251,7 +354,7 @@ static void _get_graph_output_asymm_int8_norm_tensor
            vsi_nn_tensor_id_t id = node->output.tensors[j];
            vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
-            if (_is_asymm_int8_norm_tensor(tensor))
+            if (_is_int8_norm_tensor(graph, tensor))
            {
                if(tensor_ids != NULL)
                {
@ -360,6 +463,7 @@ static vsi_status _add_graph_dataconvert_for_int8
                {
                   memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t));
                   attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
                   attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
                   attr.dtype.zero_point += 128;
                   attr.vtl = TRUE;
                   output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL );
@ -383,6 +487,7 @@ static vsi_status _add_graph_dataconvert_for_int8
            {
                memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t));
                attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
                attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
                attr.dtype.zero_point += 128;
                attr.vtl = TRUE;
                input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL );
@ -788,6 +893,7 @@ static void _convert_const_I8toU8
    }
    attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
    attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
    attr->dtype.zero_point += 128;
    if ( tensor->t ) vxReleaseTensor(&tensor->t);
@ -818,7 +924,7 @@ static vsi_status _convert_graph_const_tensor
           vsi_nn_tensor_id_t id = node->input.tensors[j];
           vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
-           if (_is_asymm_int8_const_tensor(tensor))
+           if (_is_int8_const_tensor(graph, tensor))
           {
               _convert_const_I8toU8(graph, id);
           }
@ -835,11 +941,9 @@ static vsi_status _convert_virtual_tensor_attr
    vsi_nn_tensor_t * tensor
    )
 {
-    if (_is_asymm_int8_virtual_tensor(tensor))
+    tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
-    {
+    tensor->attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
-        tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
+    tensor->attr.dtype.zero_point += 128;
        tensor->attr.dtype.zero_point += 128;
    }
    return VSI_SUCCESS;
 }/* _convert_virtual_tensor_attr() */
@ -849,7 +953,7 @@ static vsi_status _convert_graph_virtual_tensor
    vsi_nn_graph_t* graph
    )
 {
-    vsi_status status = VSI_FAILURE;
+    vsi_status status = VSI_SUCCESS;
    uint32_t node_num = graph->node_num;
    vsi_nn_node_t* node = NULL;
    uint32_t i = 0;
@ -865,7 +969,10 @@ static vsi_status _convert_graph_virtual_tensor
            vsi_nn_tensor_id_t id = node->input.tensors[j];
            vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
-            status = _convert_virtual_tensor_attr(tensor);
+            if (_is_int8_virtual_tensor(graph, tensor))
            {
                status = _convert_virtual_tensor_attr(tensor);
            }
        }
        for(j = 0; j < node->output.num; j++)
@ -873,7 +980,10 @@ static vsi_status _convert_graph_virtual_tensor
            vsi_nn_tensor_id_t id = node->output.tensors[j];
            vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id);
-            status = _convert_virtual_tensor_attr(tensor);
+            if (_is_int8_virtual_tensor(graph, tensor))
            {
                status = _convert_virtual_tensor_attr(tensor);
            }
        }
    }
@ -925,7 +1035,7 @@ vsi_status vsi_nn_OptimizeGraph
    status = VSI_SUCCESS;
-    if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8)
+    if (!nbg_flag &&((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8)
    {
        status = _graph_optimization_convert_int8_to_uint8(graph, dirty);
        CHECK_STATUS_FAIL_GOTO(status, final);
--- a/src/tim/vx/internal/src/vsi_nn_internal_node.c
+++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c
@ -452,7 +452,8 @@ void vsi_nn_internal_init_tensor_attr
    if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE &&
        ( dtype->vx_type != VSI_NN_TYPE_FLOAT16 &&
          dtype->vx_type != VSI_NN_TYPE_FLOAT32 &&
-          dtype->vx_type != VSI_NN_TYPE_BFLOAT16 ) )
+          dtype->vx_type != VSI_NN_TYPE_BFLOAT16 &&
          dtype->vx_type != VSI_NN_TYPE_INT32) )
    {
        attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
        attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16;
--- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
+++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
@ -208,6 +208,10 @@ static _node_template s_template[] =
    /* RESIZE_3D */                NULL,
    /* REDUCEL2 */              NULL,
    /* CROP_AND_RESIZE */       NULL,
    /* BITCAST */       NULL,
    /* GROUPED_CONV3D */        NULL,
    /* COL2IM */       NULL,
    /* L1_LAYER_NORM */         NULL,
 };
 //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c );
--- a/src/tim/vx/internal/src/vsi_nn_ops.c
+++ b/src/tim/vx/internal/src/vsi_nn_ops.c
@ -26,6 +26,7 @@
 #include "vsi_nn_client_op.h"
 #include "vsi_nn_node.h"
 #include "vsi_nn_types.h"
 #include "vsi_nn_types_prv.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"
@ -281,7 +282,7 @@ vsi_bool vsi_nn_OpCheck
    if ( NULL != proc )
    {
        ret = TRUE;
-        if ( proc->check && node->graph->ctx->options.enable_opcheck)
+        if ( proc->check && ((vsi_nn_graph_prv_t*)(node->graph))->options->enable_opcheck)
        {
            ret = proc->check( node, inputs, outputs );
        }
--- a/src/tim/vx/internal/src/vsi_nn_tensor.c
+++ b/src/tim/vx/internal/src/vsi_nn_tensor.c
@ -144,6 +144,17 @@ static void print_tensor
                         tensor->attr.dtype.scale_dim);
        ext_attr[count] = 0;
        break;
 #endif
 #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
    case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
        count = snprintf(&ext_attr[0],
                         _EXT_ATTR_BUF_SZ,
                         "SYM GPTQ axis=%d, count=%d, group_size=%d",
                         tensor->attr.dtype.group_channel_dim,
                         tensor->attr.dtype.group_count,
                         tensor->attr.dtype.group_size);
        ext_attr[count] = 0;
        break;
 #endif
    default:
        vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ);
@ -430,6 +441,25 @@ static vsi_bool _init_tensor
        VSILOGE(
            "can't support qnt_type "
            "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC.");
 #endif
    case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
 #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
        params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP;
        // Workaround: the driver doesn't support const scales, so copy them into a mutable buffer
        scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
        CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
        memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
        params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
        params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
        params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
        params.quant_data.affinePerGroup.scales = scales;
        params.quant_data.affinePerGroup.zero_points = NULL;
        params.quant_data.affinePerGroup.zero_point_group_count = 0;
        break;
 #else
        VSILOGE(
            "can't support qnt_type "
            "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC.");
 #endif
    default:
        break;
--- a/src/tim/vx/internal/src/vsi_nn_types_prv.h
+++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h
@ -58,6 +58,7 @@ typedef struct _vsi_nn_graph_prv
    // Add graph internal attribute here...
    vsi_nn_swap_handle_cache_t swap_handle_cache;
    vsi_nn_runtime_option_t* options;
 } vsi_nn_graph_prv_t;
 /** Internal Node structure, internal use only. */