Update internal for 22Q2 release (#432)

* Update internal for 22Q2 release

update to internal commit-id: e96103281b08404cabb9b65306587627cfa3cb93

Signed-off-by: yuenan.li <yuenan.li@verisilicon.com>

* Update prebuilt for 22Q2 release

Signed-off-by: yuenan.li <yuenan.li@verisilicon.com>

Co-authored-by: yuenan.li <yuenan.li@verisilicon.com>
liyuenan 2022-07-25 09:29:22 +08:00 committed by GitHub
parent 9f331ed5ec
commit 7d88a668e3
256 changed files with 24998 additions and 22686 deletions

.gitignore (vendored)
View File

@ -336,3 +336,4 @@ ASALocalRun/
# IDE
.settings/
build/
*_build/

View File

@ -1 +1 @@
REL/6.4.10.2
6.4.11

View File

@ -499,6 +499,8 @@ enum vx_kernel_e {
VX_KERNEL_NN_DECONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x31,
VX_KERNEL_STREAM_PROCESSOR = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x32,
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};

View File

@ -196,4 +196,45 @@ VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support d
#define VX_TENSOR_STRIDE_X_BITS_SUPPORT 1
#endif
/*
VX_REMOVE_RESHAPE_SUPPORT is used to declare whether the graph optimizer supports removing reshape ops; if supported, ovxlib does not need to remove reshape itself.
0: not support
1: support
*/
/*
#ifndef VX_REMOVE_RESHAPE_SUPPORT
#define VX_REMOVE_RESHAPE_SUPPORT 0
#endif
*/
/*
VX_STREAM_PROCESSOR_SUPPORT is used to declare that the VSI OpenVX driver supports the vxStreamProcessorNode API
[value]
0: not support
1: support
*/
#ifndef VX_STREAM_PROCESSOR_SUPPORT
#define VX_STREAM_PROCESSOR_SUPPORT 0
#endif
/*
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL is used to declare that a tensor can be connected to a fixed DMA channel.
[value]
0: not support
1: support
*/
#ifndef VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL
#define VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL 1
#endif
/*
VX_SCALE_EXTRA_PARAMETER_SUPPORT is used to declare that RESIZE supports the align_corners and half_pixel_centers parameters
[value]
0: not support
1: support
*/
#ifndef VX_SCALE_EXTRA_PARAMETER_SUPPORT
#define VX_SCALE_EXTRA_PARAMETER_SUPPORT 1
#endif
#endif /* __VX_KHR_COMPATIBLE_H__ */
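
These feature macros are intended to be tested at compile time by code that includes this header. A minimal, hedged sketch of how a client might guard the new stream-processor path (illustrative only, not part of this change; both helper functions are hypothetical):

#if defined(VX_STREAM_PROCESSOR_SUPPORT) && VX_STREAM_PROCESSOR_SUPPORT
    /* Driver build exposes vxStreamProcessorNode(); take the SP path. */
    node = build_stream_processor_graph(context, graph);  /* hypothetical helper */
#else
    /* Fall back to the regular NN/TP node graph. */
    node = build_reference_graph(context, graph);          /* hypothetical helper */
#endif
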

View File

@ -57,6 +57,12 @@ enum vx_graph_attribute_internal_type_e
VX_GRAPH_AXI_SRAM_PRE_LOAD = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x2,
/*! \brief Queries a graph for its running priority (read-write). Use a <tt>\ref vx_uint32</tt> parameter. */
VX_GRAPH_PRIORITY_VALUE_VIV = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x3,
VX_GRAPH_PSI_EXTRATOR_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x4,
VX_GRAPH_PSI_FILLER_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x5,
VX_GRAPH_DENOISE_POSTPROCESS_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x6,
VX_GRAPH_DATA_COMPRESSION_RATIO = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x7,
VX_GRAPH_ISP_EMULATION_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x8,
VX_GRAPH_PROCESS_FPS = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x9,
};
/*! \brief Size Alignment of User Memory
@ -209,7 +215,8 @@ enum vx_nn_activation_function_e
VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x4,
VX_NN_ACTIVATION_SWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x5,
VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6,
VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
VX_NN_ACTIVATION_CUSTOM = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8,
};
/*! \brief The Convolutional network type
@ -285,6 +292,59 @@ enum vx_tensor_rank_type_e
VX_TENSOR_RANK_SN,
};
/*! \brief The priority attribute of a tensor.
* \ingroup group_tensor
* \version 0.4
*/
enum vx_tensor_priority_e
{
/*! \brief no special requirement */
VX_TENSOR_DEFAULT = 0,
/*! \brief 2nd input(reference) */
/*VX_TENSOR_2ND_INPUT_FOR = 1,*/
VX_TENSOR_FOR_GRAPH_REFERENCE = 1,
};
/*! \brief The attribute of tensor memory.
* \ingroup group_tensor
* \version 0.4
*/
enum vx_tensor_memory_attribute_e
{
/*! \brief no special requirement */
VX_TENSOR_MEMORY_DEFAULT = 0,
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_0 = (0x1 << 0),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_1 = (0x1 << 1),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2 = (0x1 << 2),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_3 = (0x1 << 3),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_4 = (0x1 << 4),
/*
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_5 = (0x1 << VX_DMA5_IN_ISP_OCM_PSI),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_6 = (0x1 << VX_DMA6_DDR_DECOMPRESS),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_7 = (0x1 << VX_DMA7_POSTOUT_OCM_ISP),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_8 = (0x1 << VX_DMA8_COMPRESS_DDR),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_9 = (0x1 << VX_DMA9_ISP_PATTERN_GENERATOR),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_10 = (0x1 << VX_DMA10_ISP_CHECKSUM_GENERATOR),
*/
/*! \brief DMA transfer data to VIP and enable circular buffer */
#if !VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL
VX_TENSOR_MEMORY_ENABLE_CIRCULAR_BY_DMA = 0xFFFFFFFF,
#endif
};
enum vx_dma_extrator_pad_mode_e
{
/*! \brief no special requirement */
VX_DMA_EXTRATOR_PAD_CONST = 0,
/*! \brief DMA extrator pad with nearest edge */
VX_DMA_EXTRATOR_PAD_WITH_NEAREAST_EDGE = 1,
};
/*! \brief The precision of tensor.
* \ingroup group_tensor
* \version 0.4
@ -601,6 +661,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* n
*/
VX_API_ENTRY vx_status VX_API_CALL vxSetTensorAttribute(vx_tensor tensor, vx_enum attribute, const void *ptr, vx_size size);
/*! \brief Creates an opaque reference to a tensor data buffer.
* \details The tensor is a dummy tensor that does not allocate any memory; it cannot be reshaped or viewed.
* Not guaranteed to exist until the <tt>vx_graph</tt> containing it has been verified.
* \param [in] context The reference to the implementation context.
* \param [in] number_of_dims The number of dimensions.
* \param [in] dims Dimensions sizes in elements.
* \param [in] data_format The <tt>\ref vx_type_e</tt> that represents the data format of the tensor data elements.
* \return A tensor data reference or zero when an error is encountered.
* \ingroup group_tensor
* \version 0.3
*/
VX_API_ENTRY vx_tensor VX_API_CALL vxCreateDummyTensor(vx_context context, vx_size number_of_dims, const vx_size *dims, vx_enum data_format);
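
A minimal usage sketch for the new dummy-tensor API (illustrative; error handling trimmed):

vx_size dims[4] = { 224, 224, 3, 1 };
/* No buffer is allocated; the reference only carries shape and data format. */
vx_tensor dummy = vxCreateDummyTensor(context, 4, dims, VX_TYPE_UINT8);
if (vxGetStatus((vx_reference)dummy) != VX_SUCCESS)
{
    /* creation failed */
}
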
/*! \brief The type enumeration lists all NN extension types.
* \ingroup group_cnn
@ -1317,6 +1390,13 @@ typedef struct _vx_nn_scale_params_t
vx_enum type; /*!< \brief The interpolation type, only support VX_INTERPOLATION_BILINEAR. */
} vx_nn_scale_params_t, * vx_nn_scale_params;
typedef struct _vx_nn_scale_params_ext_t
{
vx_nn_scale_params_t base;
vx_bool align_corners;
vx_bool half_pixel_centers;
} vx_nn_scale_params_ext_t, * vx_nn_scale_params_ext;
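
A hedged sketch of filling the extended resize parameters introduced together with VX_SCALE_EXTRA_PARAMETER_SUPPORT (values are illustrative):

vx_nn_scale_params_ext_t scale_params;
scale_params.base.type          = VX_INTERPOLATION_BILINEAR; /* only bilinear is supported */
scale_params.align_corners      = vx_false_e;
scale_params.half_pixel_centers = vx_true_e;
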
/*! \brief [Graph] Creates a scale Layer Node.
* \param [in] graph The reference to the parent graph.
* \param [in] input The input tensor data to scale.
@ -2054,8 +2134,15 @@ typedef struct _vx_hardware_caps_params_ext_t
vx_hardware_caps_params_t base;
vx_uint32 subGroupSize; /*!< \brief shader sub-group size.*/
vx_bool supportVA40; /*!< \brief support 40bit virtual address.*/
vx_uint32 supportStreamProcessor; /*!< \brief support stream processor.*/
} vx_hardware_caps_params_ext_t;
typedef struct _vx_hardware_caps_params_ext2_t
{
vx_hardware_caps_params_ext_t base;
vx_uint32 streamProcessorExecCount; /*!< \brief streamprocess execution count. */
} vx_hardware_caps_params_ext2_t;
/*! \brief Queries hardware caps information.
* \param [in] context The reference to the context.
* \param [in] hardware_caps_params <tt>\ref vx_hardware_caps_params_t </tt>.

View File

@ -219,6 +219,15 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext4_t
vx_bool enable_nn_tensor_add_relu; /*!< \brief Enable Relu function after tensor add. */
} vx_nn_convolution_relu_pooling_params_ext4_t, * vx_nn_convolution_relu_pooling_params_ext4;
typedef struct _vx_nn_convolution_relu_pooling_params_ext5_t
{
vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params <tt>\ref vx_nn_convolution_relu_pooling_params_ext_t</tt> */
vx_object_array inputs_list;
vx_object_array outputs_list;
vx_spinst spinst_obj;
} vx_nn_convolution_relu_pooling_params_ext5_t, * vx_nn_convolution_relu_pooling_params_ext5;
/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node; this function matches the Khronos NN Extension 1.2 version.
* \details This function implements the Convolutional Network Convolution and Activation(Relu) and Pooling layer.
* For fixed-point data types, a fixed-point calculation is performed with round and saturate according to the number of accumulator bits. The number of accumulator bits is implementation defined,

View File

@ -963,6 +963,40 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmNode(vx_graph graph,
vx_scalar trans_c,
vx_tensor output);
typedef struct _vx_lut_params_s
{
vx_enum lut_function; /*!< \brief Set VX_NN_ACTIVATION_NONE to disable the LUT, VX_NN_ACTIVATION_CUSTOM to supply a custom LUT, or any other activation value to use a fixed LUT table */
vx_float32 float_values[4]; /*!< \brief Float parameters of fixed lut table */
vx_uint32 fvalues_count; /*!< \brief Count of float_values */
vx_int32 int_values[4]; /*!< \brief Int parameters of fixed lut table */
vx_uint32 ivalues_count; /*!< \brief Count of int_values */
vx_lut in_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */
vx_lut out_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */
} vx_lut_params_s, * vx_lut_params;
/*! \brief Create a stream processor node.
* \param [in] graph The reference to the graph.
* \param [in] input_list The input tensor list.
* \param [in] input_count The input tensor count.
* \param [in] output_list The output tensor list.
* \param [in] output_count The output tensor count.
* \param [in] spinst_obj The stream processor instruction object. Use vxCreateSPINST() to create.
* \param [in] lut_params The lut parameters. Refer to vx_lut_params_s.
* \return <tt>\ref vx_node</tt>.
* \retval vx_node A node reference. Any possible errors preventing a successful creation
* should be checked using <tt>\ref vxGetStatus</tt>
* \ingroup group_vision_function_sp
*/
VX_API_ENTRY vx_node VX_API_CALL vxStreamProcessorNode(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor* output_list,
vx_uint32 output_count,
vx_spinst spinst_obj,
vx_lut_params lut_params
);
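
A hedged call-sequence sketch for the new API (the instruction program itself is built through the vx_spinst object declared in vx_spinst.h; register choices and tensor counts here are arbitrary, and error handling is omitted):

vx_spinst sp = vxCreateSPINST(context);
/* ... describe the per-element program with vxAddOneInstToSPINST() ... */

vx_lut_params_s lut_params = { 0 };
lut_params.lut_function = VX_NN_ACTIVATION_NONE;   /* no LUT stage */

vx_tensor in_list[1]  = { input_tensor };
vx_tensor out_list[1] = { output_tensor };
vx_node sp_node = vxStreamProcessorNode(graph, in_list, 1, out_list, 1, sp, &lut_params);
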
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,332 @@
/****************************************************************************
*
* Copyright 2017 - 2021 Vivante Corporation, Santa Clara, California.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* 'Software'), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VX_SPINST_H_
#define _VX_SPINST_H_
#ifdef __cplusplus
extern "C" {
#endif
typedef enum _vx_sp_inst_type_e
{
VX_SP_INST_TYPE_FADD,
VX_SP_INST_TYPE_FMULT,
VX_SP_INST_TYPE_MOVE,
VX_SP_INST_TYPE_PWL,
VX_SP_INST_TYPE_COUNT,
}
vx_sp_inst_type_e;
typedef enum _vx_sp_inst_type_fadd_e
{
VX_SP_INST_TYPE_FADD_IDLE, // FADD-IDLE
VX_SP_INST_TYPE_FADD_ADD, // dst = src0 + src1
VX_SP_INST_TYPE_FADD_SUB, // dst = src0 - src1
VX_SP_INST_TYPE_FADD_COUNT,
}
vx_sp_inst_type_fadd_e;
typedef enum _vx_sp_inst_type_fmult_e
{
VX_SP_INST_TYPE_FMULT_IDLE, /* FMULT-IDLE */
VX_SP_INST_TYPE_FMULT_MUL, /* dst = src0 * src1 */
VX_SP_INST_TYPE_FMULT_MUL_CLAMP, /* dst = clamp (src0, src1, R6, R7) */
VX_SP_INST_TYPE_FMULT_COUNT,
}
vx_sp_inst_type_fmult_e;
typedef enum _vx_sp_inst_type_move_e
{
VX_SP_INST_TYPE_MOVE_IDLE,
VX_SP_INST_TYPE_MOVE_MOVE, // dst = src1
VX_SP_INST_TYPE_MOVE_SEL0, // dst = (src0 > 0) ? src1[0] : src1[1]
VX_SP_INST_TYPE_MOVE_SEL1, // dst = (src0 > 0) ? src1 : FA-src0 // use FA's SRC0
VX_SP_INST_TYPE_MOVE_IMMD, // dst = Constant assign immediate
VX_SP_INST_TYPE_MOVE_ABS, // dst = abs(src1)
VX_SP_INST_TYPE_MOVE_COUNT,
}
vx_sp_inst_type_move_e;
typedef enum _vx_sp_inst_type_pwl_e
{
VX_SP_INST_TYPE_PWL_IDLE,
VX_SP_INST_TYPE_PWL_SETUP_0, /* PWL ID = 0 */
VX_SP_INST_TYPE_PWL_SETUP_1, /* Sigmoid() */
VX_SP_INST_TYPE_PWL_SETUP_2, /* Tanh() */
VX_SP_INST_TYPE_PWL_COUNT,
}
vx_sp_inst_type_pwl_e;
typedef enum _vx_sp_inst_src_dst_e
{
VX_SP_INST_SPINOUT,
VX_SP_INST_SR1,
VX_SP_INST_SR2,
VX_SP_INST_SR3,
VX_SP_INST_SR4,
VX_SP_INST_SR5,
VX_SP_INST_SR6, /* nn_clamp_min */
VX_SP_INST_SR7, /* nn_clamp_max */
VX_SP_INST_SR8,
VX_SP_INST_SR9,
VX_SP_INST_SR10,
VX_SP_INST_VR11,
VX_SP_INST_VR12,
VX_SP_INST_VR13,
VX_SP_INST_VR14,
VX_SP_INST_SETUPOUT, /* Input of PWL Mult and Add: FMInA, FMInB, FAInA, FAInB */
}
vx_sp_inst_src_dst_e;
typedef struct _vx_spinst_unit_param
{
vx_enum op; /* vx_sp_inst_type_e */
struct
{
vx_enum op; /* vx_sp_inst_type_fadd/fmult/move/pwl_e */
struct
{
vx_uint8 src0; /* vx_sp_inst_src_dst_e */
vx_uint8 src1; /* vx_sp_inst_src_dst_e */
vx_uint8 dst; /* vx_sp_inst_src_dst_e */
vx_float32 constant;
} var;
} sub;
}
vx_spinst_unit_param;
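
A hedged sketch of describing a single FADD instruction with this structure and registering it on a spinst object via vxAddOneInstToSPINST() declared below (register choices are arbitrary):

vx_spinst_unit_param unit = { 0 };
unit.op           = VX_SP_INST_TYPE_FADD;
unit.sub.op       = VX_SP_INST_TYPE_FADD_ADD;   /* dst = src0 + src1 */
unit.sub.var.src0 = VX_SP_INST_SR1;
unit.sub.var.src1 = VX_SP_INST_SR2;
unit.sub.var.dst  = VX_SP_INST_SPINOUT;
vxAddOneInstToSPINST(spinst_obj, &unit, 1);
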
/**********************************************************************************************/
typedef enum _vx_sp_attribute_e
{
VX_SP_ATTRIBUTE_NONE,
VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_X,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Y,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Z,
VX_SP_ATTRIBUTE_PROG_INIT_INSTR_NUM,
VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM,
VX_SP_ATTRIBUTE_PROG_COMPLETE_INSTR_NUM,
VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE,
VX_SP_ATTRIBUTE_INPUT_SETUP,
VX_SP_ATTRIBUTE_IGNORED_LEADING_OUTPUTS,
VX_SP_ATTRIBUTE_FLUSH_CYCLE_NUM,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_WR,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_WR,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_RD,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_RD,
VX_SP_ATTRIBUTE_CH0_POST_REDISTRIBUTE,
VX_SP_ATTRIBUTE_CH1_POST_REDISTRIBUTE,
VX_SP_ATTRIBUTE_V11_RESET_AT_START,
VX_SP_ATTRIBUTE_V12_RESET_AT_START,
VX_SP_ATTRIBUTE_V11_POP_CONFIG,
VX_SP_ATTRIBUTE_V12_POP_CONFIG,
VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT,
VX_SP_ATTRIBUTE_IGNORED_LEADING_ACC_OUT,
VX_SP_ATTRIBUTE_SUM_ENGINE_RESET,
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL,
VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE,
VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE,
VX_SP_ATTRIBUTE_GENERAL_COUNT,
VX_SP_ATTRIBUTE_CONST0, /* NN post multiplier */
VX_SP_ATTRIBUTE_CONST1, /* NN neg pos multiplier */
VX_SP_ATTRIBUTE_CONST2, /* NN tensor add const */
VX_SP_ATTRIBUTE_CONST3, /* NN clamp max */
VX_SP_ATTRIBUTE_CONST4, /* NN clamp min */
VX_SP_ATTRIBUTE_TOTAL_COUNT,
}
vx_sp_attribute_e;
typedef enum _vx_sp_attribute_input_tile_mapping_e
{
VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_XYMERGE,
VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_YZMERGE,
}
vx_sp_attribute_input_tile_mapping_e;
typedef enum _vx_sp_attribute_output_collapse_e
{
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_DISABLED,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_ENABLED,
}
vx_sp_attribute_output_collapse_e;
typedef enum _vx_sp_attribute_rounding_mode_e
{
VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_RTNE,
VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_STICKY,
}
vx_sp_attribute_rounding_mode_e;
typedef enum _vx_sp_attribute_input_setup_e
{
VX_SP_ATTRIBUTE_INPUT_SETUP_SINGLE_INPUT,
VX_SP_ATTRIBUTE_INPUT_SETUP_INTERLEAVE_TWO_INPUTS,
VX_SP_ATTRIBUTE_INPUT_SETUP_V11,
VX_SP_ATTRIBUTE_INPUT_SETUP_V12,
}
vx_sp_attribute_input_setup_e;
typedef enum _vx_sp_attribute_ch_post_redistribute_e
{
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_DISABLED,
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_SCALAR_GATHER,
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_GATHER,
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_SCATTER,
}
vx_sp_attribute_ch_post_redistribute_e;
typedef enum _vx_sp_attribute_v_reset_at_start_e
{
VX_SP_ATTRIBUTE_V_RESET_AT_START_NONE,
VX_SP_ATTRIBUTE_V_RESET_AT_START_RESET,
}
vx_sp_attribute_v_reset_at_start_e;
typedef enum _vx_sp_attribute_v_pop_config_e
{
VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_READ,
VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_ROW,
}
vx_sp_attribute_v_pop_config_e;
typedef enum _vx_sp_attribute_accelerator_input_select_e
{
VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_OUTPUT,
VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_ACCLERATOR,
}
vx_sp_attribute_accelerator_input_select_e;
typedef enum _vx_sp_attribute_sum_engine_reset_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_NONE,
VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_RESET,
}
vx_sp_attribute_sum_engine_reset_e;
typedef enum _vx_sp_attribute_sum_engine_control_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_INTERNAL,
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_1D,
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_2D,
}
vx_sp_attribute_sum_engine_control_e;
typedef enum _vx_sp_attribute_sum_engine_num_ch_minus_one_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_ONE_CH,
VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_TWO_CH,
}
vx_sp_attribute_sum_engine_num_ch_minus_one_e;
typedef enum _vx_sp_attribute_sum_engine_2d_accum_storage_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_SAME,
VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_DIFFERENT,
}
vx_sp_attribute_sum_engine_2d_accum_storage_e;
/**********************************************************************************************/
/*! \brief Creates an opaque reference to a spinst data.
* \param [in] context The reference to the implementation context.
* \return A spinst data reference.
* Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST(
vx_context context
);
/*! \brief Releases a reference to a spinst object.
* The object may not be garbage collected until its total reference count is zero.
* \param [in] spinst_obj The pointer to the spinst data to release.
* \post After returning from this function the reference is zeroed.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors; all other values indicate failure
* \retval * An error occurred. See <tt>\ref vx_status_e</tt>.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST(
vx_spinst *spinst_obj
);
/*! \brief Add an instruction to a spinst object.
* \param [in] spinst_obj The reference to the spinst object.
* \param [in] inst_unit_array The units of one instruction. Use a <tt>\ref vx_spinst_unit_param</tt>.
* \param [in] inst_unit_count The count of instruction units.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors.
* \retval VX_ERROR_INVALID_REFERENCE If spinst_obj is not a valid <tt>\ref vx_spinst</tt>.
* \retval VX_ERROR_INVALID_PARAMETERS If any of the parameters are incorrect.
* \retval VX_ERROR_NO_MEMORY If internal instruction memory allocation fails.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_status VX_API_CALL vxAddOneInstToSPINST(
vx_spinst spinst_obj,
vx_spinst_unit_param* inst_unit_array,
vx_uint8 inst_unit_count
);
/*! \brief Set various attributes of a spinst data.
* \param [in] spinst_obj The reference to the vx_spinst object to set.
* \param [in] attribute The attribute to set. Use a <tt>\ref vx_sp_attribute_e</tt>.
* \param [in] value The value of attribute.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors.
* \retval VX_ERROR_INVALID_REFERENCE If data is not a <tt>\ref vx_spinst</tt>.
* \retval VX_ERROR_INVALID_PARAMETERS If the attribute is incorrect.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_status VX_API_CALL vxSetAttributeToSPINST(
vx_spinst spinst_obj,
vx_enum attribute,
vx_uint32 value
);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -342,6 +342,10 @@ typedef struct _vx_tensorpatch_addressing_t * vx_trensor_addressing;
*/
typedef struct _vx_weights_biases_parameter_s * vx_weights_biases_parameter;
/*! \brief The object for stream processor
* \ingroup group_spinst
*/
typedef struct _vx_spinst_s * vx_spinst;
/*! \brief A Boolean value.
* This allows 0 to be FALSE, as it is in C, and any non-zero to be TRUE.
@ -470,6 +474,7 @@ enum vx_type_e {
/* \todo add new object types here */
VX_TYPE_BFLOAT16 = 0x81A,/*!< \brief A <tt>\ref vx_bfloat16</tt>. */
VX_TYPE_SPINST = 0x81B,/*!< \brief A <tt>\ref vx_spinst</tt>. */
VX_TYPE_INT4 = 0x81C,/*!< \brief A signed 4-bit tensor element type. */
VX_TYPE_UINT4 = 0x81D,/*!< \brief An unsigned 4-bit tensor element type. */
};
@ -1021,6 +1026,8 @@ enum vx_node_attribute_e {
VX_NODE_ATTRIBUTE_CONST_TENSOR_CACHE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x9,
VX_NODE_ATTRIBUTE_FOR_HW_QUALITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA,
};
/*! \brief The parameter attributes list
@ -1290,6 +1297,9 @@ enum vx_tensor_attribute_e
VX_TENSOR_LIFETIME = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x5,
/*! \brief the value status of tensor. */
VX_TENSOR_VALUE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x6,
/*XiaoMi project*/
VX_TENSOR_INPUT_FOR_REFERENCE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x7,
VX_TENSOR_MEMORY_ATTRIBUTE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x8,
};
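
A hedged sketch of using the new attribute together with the vx_tensor_memory_attribute_e flags above (the exact value type expected by the driver is an assumption here):

/* Bind this tensor's buffer to DMA channel 2; only meaningful when
 * VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL is enabled in vx_khr_compatible.h. */
vx_enum mem_attr = VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2;
vxSetTensorAttribute(tensor, VX_TENSOR_MEMORY_ATTRIBUTE, &mem_attr, sizeof(mem_attr));
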
/*! \brief The meta valid rectangle attributes.

View File

@ -1 +0,0 @@
libOpenVX.so.1.3.0

Binary file not shown.

View File

@ -1 +0,0 @@
libOpenVX.so.1.3.0

Binary file not shown.

View File

@ -172,3 +172,10 @@ DEF_OP(PRE_PROCESS_RGB888_PLANAR)
DEF_OP(GATHER_ELEMENTS)
DEF_OP(SELU)
DEF_OP(CELU)
DEF_OP(MAX_POOL3D)
DEF_OP(RCP)
DEF_OP(SIGN)
DEF_OP(SOFTSIGN)
DEF_OP(CUMSUM)
DEF_OP(MAXPOOLWITHARGMAX)
DEF_OP(MOD)

View File

@ -25,10 +25,13 @@
#ifndef _VSI_NN_GPU_CONFIG_H
#define _VSI_NN_GPU_CONFIG_H
#define GPU_TENSOR_MAX_WIDTH (65536)
#ifdef VSI_40BIT_VA_SUPPORT
#define GPU_TENSOR_MAX_WIDTH (1 << 30)
#else
#define GPU_TENSOR_MAX_WIDTH (1 << 16)
#endif
#define GPU_MAX_MULTIPLIER_NUM (65535)
#define GPU_MAX_POST_SHIFT_BITS (31)
#define GPU_TENSOR_DIM_2 (2)
#endif

View File

@ -156,6 +156,8 @@ typedef struct
vsi_nn_kernel_quant_asymm_t asymm;
vsi_nn_kernel_quant_asymm_perchannel_t asymm_v;
};
float scale;
int32_t zero_point;
} vsi_nn_kernel_tensor_attr_t;
typedef struct
@ -411,7 +413,7 @@ vsi_status vsi_nn_kernel_node_pass_param
size_t num
);
static inline void vsi_nn_kernel_node_release
static VSI_INLINE_API void vsi_nn_kernel_node_release
(
vsi_nn_kernel_node_t * node
)
@ -422,7 +424,7 @@ static inline void vsi_nn_kernel_node_release
}
}
static inline void vsi_nn_kernel_node_pack_io
static VSI_INLINE_API void vsi_nn_kernel_node_pack_io
(
vsi_nn_kernel_node_param_t * params,
size_t param_num,
@ -476,7 +478,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
);
/** Map data type to gpu internal dtype. */
static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
(
vsi_nn_type_e dtype
)
@ -516,7 +518,7 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
return I8;
} /* vsi_nn_kernel_map_dtype() */
static inline vsi_nn_type_e vsi_nn_dtype_map_kernel
static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel
(
vsi_nn_kernel_dtype_e dtype
)
@ -556,7 +558,7 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel
return VSI_NN_TYPE_INT8;
} /* vsi_nn_kernel_map_dtype() */
static inline size_t vsi_nn_kernel_dtype_get_bytes
static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes
(
vsi_nn_kernel_dtype_e dtype
)
@ -585,7 +587,7 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes
return 0;
} /* vsi_nn_kernel_dtype_get_bytes() */
static inline vsi_size_t vsi_nn_kernel_dtype_get_bits
static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits
(
vsi_nn_kernel_dtype_e dtype
)
@ -617,7 +619,7 @@ static inline vsi_size_t vsi_nn_kernel_dtype_get_bits
return 0;
} /* vsi_nn_kernel_dtype_get_bits() */
static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type
static VSI_INLINE_API vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type
( vsi_nn_qnt_type_e quant_type )
{
switch( quant_type )
@ -658,7 +660,7 @@ vsi_nn_kernel_scalar_t vsi_nn_kernel_scalar_create
const void * data
);
static inline void vsi_nn_kernel_scalar_release
static VSI_INLINE_API void vsi_nn_kernel_scalar_release
( vsi_nn_kernel_scalar_t * scalar )
{
if( scalar && *scalar )
@ -803,7 +805,7 @@ vsi_status vsi_nn_kernel_tensor_write
size_t size
);
static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size
static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_size
( const vsi_nn_kernel_tensor_attr_t * attr )
{
if( !attr )
@ -813,7 +815,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size
return vsi_nn_shape_get_size( attr->shape->data, (vsi_size_t)attr->shape->size );
} /* vsi_nn_kernel_tensor_attr_get_size() */
static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
( const vsi_nn_kernel_tensor_attr_t * attr )
{
vsi_size_t i = 0;
@ -851,7 +853,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
return bytes;
} /* vsi_nn_kernel_tensor_attr_get_bytes() */
static inline void vsi_nn_kernel_tensor_attr_get_stride
static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride
( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride)
{
vsi_size_t type_bits;
@ -902,7 +904,7 @@ static inline void vsi_nn_kernel_tensor_attr_get_stride
}
} /* vsi_nn_kernel_tensor_attr_get_size() */
static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized
static VSI_INLINE_API vsi_bool vsi_nn_kernel_tensor_attr_is_quantized
( const vsi_nn_kernel_tensor_attr_t * attr )
{
return ( attr && attr->quant > VSI_NN_KERNEL_QUANT_NONE
@ -1072,7 +1074,7 @@ OVXLIB_API vsi_status vsi_nn_KernelGpuConfig
const gpu_param_t * gpu_param
);
static inline const char* vsi_nn_kernel_type_str
static VSI_INLINE_API const char* vsi_nn_kernel_type_str
(
vsi_nn_kernel_type_e type
)
@ -1095,7 +1097,7 @@ static inline const char* vsi_nn_kernel_type_str
return "None";
} /* vsi_nn_kernel_type_str() */
static inline vsi_status vsi_nn_kernel_unpack_4bit_data
static VSI_INLINE_API vsi_status vsi_nn_kernel_unpack_4bit_data
(
const vsi_nn_kernel_tensor_attr_t * attr,
uint8_t * src,
@ -1162,7 +1164,7 @@ static inline vsi_status vsi_nn_kernel_unpack_4bit_data
return status;
}
static inline vsi_status vsi_nn_kernel_pack_4bit_data
static VSI_INLINE_API vsi_status vsi_nn_kernel_pack_4bit_data
(
const vsi_nn_kernel_tensor_attr_t * attr,
uint8_t * src,

View File

@ -46,6 +46,8 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_CLIP = 12,
VSI_NN_KERNEL_LUT_SQUARE = 13,
VSI_NN_KERNEL_LUT_CELU = 14,
VSI_NN_KERNEL_LUT_RCP = 15,
VSI_NN_KERNEL_LUT_SOFTSIGN = 16,
};
#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024)

View File

@ -30,11 +30,20 @@
extern "C" {
#endif
typedef struct _vsi_nn_crop_lcl_data
{
vx_int32 begin_dims[VSI_NN_MAX_DIM_NUM];
vx_int32 end_dims[VSI_NN_MAX_DIM_NUM];
vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM];
} vsi_nn_crop_lcl_data;
typedef struct _vsi_nn_crop_param
{
int32_t axis;
uint32_t dims;
uint32_t offset[VSI_NN_MAX_DIM_NUM];
vsi_nn_crop_lcl_data *lcl_data;
} vsi_nn_crop_param;
#ifdef __cplusplus

View File

@ -0,0 +1,45 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUMSUM_H
#define _VSI_NN_OP_CUMSUM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_cumsum_param
{
int32_t axis;
vsi_bool exclusive;
vsi_bool reverse;
} vsi_nn_cumsum_param;
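
A hedged sketch of configuring the new op through ovxlib's usual node flow (assumes vsi_nn_AddNode() and the VSI_NN_OP_CUMSUM value generated from DEF_OP(CUMSUM); error handling omitted):

vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_CUMSUM, 1, 1, NULL);
node->nn_param.cumsum.axis      = 2;      /* accumulate along the third dimension */
node->nn_param.cumsum.exclusive = FALSE;
node->nn_param.cumsum.reverse   = FALSE;
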
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,55 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_MAX_POOL3D_H
#define _VSI_NN_OP_MAX_POOL3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_max_pool3d_param
{
struct _max_pool3d_local_data_t* local;
// Add parameters here
/* round_type is used to calculate the output shape */
vsi_nn_round_type_e round_type;
uint32_t ksize[3];
uint32_t stride[3];
/* Pad left, right, top, bottom */
uint32_t pad[6];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
} vsi_nn_max_pool3d_param;
_compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \
vsi_nn_max_pool3d_h );
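
As with the cumsum sketch above, a hedged configuration example for the new 3D max-pooling op (assumes vsi_nn_AddNode() and the VSI_NN_OP_MAX_POOL3D value generated from DEF_OP(MAX_POOL3D); includes and error handling omitted):

vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_MAX_POOL3D, 1, 1, NULL);
node->nn_param.max_pool3d.round_type = VSI_NN_ROUND_FLOOR;
node->nn_param.max_pool3d.ksize[0]  = 2; node->nn_param.max_pool3d.ksize[1]  = 2; node->nn_param.max_pool3d.ksize[2]  = 2;
node->nn_param.max_pool3d.stride[0] = 2; node->nn_param.max_pool3d.stride[1] = 2; node->nn_param.max_pool3d.stride[2] = 2;
memset(node->nn_param.max_pool3d.pad, 0, sizeof(node->nn_param.max_pool3d.pad));
node->nn_param.max_pool3d.pad_type = VSI_NN_PAD_AUTO;
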
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_MOD_H
#define _VSI_NN_OP_MOD_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_mod_param
{
int32_t fmod;
} vsi_nn_mod_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_RCP_H
#define _VSI_NN_OP_RCP_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_rcp_param
{
struct _rcp_local_data_t* local;
// Add parameters here
} vsi_nn_rcp_param;
_compiler_assert(offsetof(vsi_nn_rcp_param, local) == 0, \
vsi_nn_rcp_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SIGN_H
#define _VSI_NN_OP_SIGN_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_sign_param
{
struct _sign_local_data_t* local;
// Add parameters here
} vsi_nn_sign_param;
_compiler_assert(offsetof(vsi_nn_sign_param, local) == 0, \
vsi_nn_sign_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SOFTSIGN_H
#define _VSI_NN_OP_SOFTSIGN_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_softsign_param
{
struct _softsign_local_data_t* local;
// Add parameters here
} vsi_nn_softsign_param;
_compiler_assert(offsetof(vsi_nn_softsign_param, local) == 0, \
vsi_nn_softsign_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -32,7 +32,7 @@
extern "C" {
#endif
static inline vsi_bool type_is_integer
static VSI_INLINE_API vsi_bool type_is_integer
(
const vsi_nn_type_e type
)
@ -60,7 +60,7 @@ static inline vsi_bool type_is_integer
return ret;
} /* type_is_integer() */
static inline vsi_bool type_is_signed
static VSI_INLINE_API vsi_bool type_is_signed
(
const vsi_nn_type_e type
)
@ -86,7 +86,7 @@ static inline vsi_bool type_is_signed
return ret;
} /* type_is_signed() */
static inline uint32_t type_get_bytes
static VSI_INLINE_API uint32_t type_get_bytes
(
const vsi_nn_type_e type
)
@ -115,7 +115,7 @@ static inline uint32_t type_get_bytes
}
} /* type_get_bytes() */
static inline uint32_t type_get_bits
static VSI_INLINE_API uint32_t type_get_bits
(
const vsi_nn_type_e type
)
@ -147,7 +147,7 @@ static inline uint32_t type_get_bits
}
} /* type_get_bits() */
static inline void type_get_range
static VSI_INLINE_API void type_get_range
(
vsi_nn_type_e type,
double * max_range,
@ -186,7 +186,24 @@ static inline void type_get_range
}
} /* type_get_range() */
static inline int32_t fp32_to_affine
static VSI_INLINE_API vsi_bool fp32_is_inf
(
float val
)
{
uint32_t u_value = *(uint32_t*)&val;
if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
{
return TRUE;
}
else
{
return FALSE;
}
}
static VSI_INLINE_API int32_t fp32_to_affine
(
const float in,
const float scale,
@ -200,10 +217,17 @@ static inline int32_t fp32_to_affine
type_get_range( type, &max_range, &min_range );
data = (int32_t)(vsi_rint( in / scale ) + zero_point );
data = vsi_nn_max( (int32_t)min_range, vsi_nn_min( (int32_t)max_range , data ) );
if (fp32_is_inf(in) != 0)
{
uint32_t sign = (*(uint32_t*)&in) >> 31;
data = sign == 1 ? (int32_t)min_range : (int32_t)max_range;
}
return data;
} /* fp32_to_affine() */
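
A hedged numeric check of the new infinity handling, assuming the usual (value, scale, zero_point, type) argument order of this helper and an asymmetric uint8 quantization with scale 0.5 and zero point 10:

#include <math.h>   /* INFINITY */

int32_t q_pos = fp32_to_affine( INFINITY, 0.5f, 10, VSI_NN_TYPE_UINT8);  /* saturates to 255 */
int32_t q_neg = fp32_to_affine(-INFINITY, 0.5f, 10, VSI_NN_TYPE_UINT8);  /* saturates to 0   */
/* Previously, vsi_rint(INFINITY / scale) cast to int32_t gave an undefined result. */
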
static inline float affine_to_fp32
static VSI_INLINE_API float affine_to_fp32
(
const int32_t val,
const float scale,
@ -216,7 +240,7 @@ static inline float affine_to_fp32
return data;
} /* affine_to_fp32() */
static inline int32_t fp32_to_dfp
static VSI_INLINE_API int32_t fp32_to_dfp
(
const float in,
const int8_t fl,
@ -237,10 +261,17 @@ static inline int32_t fp32_to_dfp
}
data = vsi_nn_min( data, (int32_t)max_range );
data = vsi_nn_max( data, (int32_t)min_range );
if (fp32_is_inf(in) != 0)
{
uint32_t sign = (*(uint32_t*)&in) >> 31;
data = sign == 1 ? (int32_t)min_range : (int32_t) max_range;
}
return data;
} /* fp32_to_dfp() */
static inline float dfp_to_fp32
static VSI_INLINE_API float dfp_to_fp32
(
const int32_t val,
const int8_t fl,
@ -259,7 +290,7 @@ static inline float dfp_to_fp32
return result;
} /* dfp_to_fp32() */
static inline vsi_status integer_convert
static VSI_INLINE_API vsi_status integer_convert
(
const void * src,
vsi_nn_type_e src_type,
@ -303,7 +334,7 @@ typedef union
float f;
} _fp32_t;
static inline float fp16_to_fp32
static VSI_INLINE_API float fp16_to_fp32
(
int16_t in
)
@ -323,7 +354,7 @@ static inline float fp16_to_fp32
return o.f;
} /* fp16_to_fp32() */
static inline float bfp16_to_fp32
static VSI_INLINE_API float bfp16_to_fp32
(
int16_t in
)
@ -344,7 +375,7 @@ static inline float bfp16_to_fp32
return t3 == 0 ? 0 : out;
} /* bfp16_to_fp32() */
static inline uint16_t fp32_to_fp16
static VSI_INLINE_API uint16_t fp32_to_fp16
(
float in
)
@ -370,7 +401,7 @@ static inline uint16_t fp32_to_fp16
return (uint16_t) fp16;
} /* fp32_to_fp16() */
static inline uint16_t fp32_to_bfp16
static VSI_INLINE_API uint16_t fp32_to_bfp16
(
float in
)
@ -381,7 +412,7 @@ static inline uint16_t fp32_to_bfp16
return (uint16_t) t1;
} /* fp32_to_bfp16() */
static inline uint16_t fp32_to_bfp16_rtne
static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
(
float in
)
@ -409,7 +440,7 @@ static inline uint16_t fp32_to_bfp16_rtne
return out;
} /* fp32_to_bfp16_rtne */
static inline vsi_status dtype_to_float32
static VSI_INLINE_API vsi_status dtype_to_float32
(
uint8_t *src,
float *dst,
@ -461,7 +492,7 @@ static inline vsi_status dtype_to_float32
return VSI_SUCCESS;
}
static inline vsi_status float32_to_dtype
static VSI_INLINE_API vsi_status float32_to_dtype
(
float src,
uint8_t *dst,

View File

@ -42,6 +42,8 @@ extern "C" {
#define vsi_clamp(x, min, max) vsi_nn_clamp(x, min, max)
#define vsi_rtne(x) vsi_rint(x)
#define VSI_NN_INT32_MAX (0x7FFFFFFF)
#define VSI_NN_FLOAT32_INF (0x7F800000)
#define VSI_NN_FLOAT32_NAN (0x7FC00000)
#define VSI_NN_FLOAT64_INF (0x7FF0000000000000)
@ -53,14 +55,14 @@ extern "C" {
size_t size; \
TYPE data[0]; \
} vsi_##NAME##_array_t; \
static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \
sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
if (array == NULL) return NULL; \
array->size = size; \
return array; \
} \
static inline void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \
static VSI_INLINE_API void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \
{ \
if( array && *array ) { \
free( *array ); \
@ -167,7 +169,7 @@ void vsi_nn_random_uniform_transform
uint32_t len
);
static inline double copy_sign
static VSI_INLINE_API double copy_sign
(
double number,
double sign
@ -177,7 +179,7 @@ static inline double copy_sign
return (sign > 0) ? value : (-value);
} /* copy_sign() */
static inline float simple_round
static VSI_INLINE_API float simple_round
(
float x
)
@ -185,7 +187,7 @@ static inline float simple_round
return (float) copy_sign(floorf(fabsf(x) + 0.5f), x);
} /* simple_round() */
static inline double vsi_rint
static VSI_INLINE_API double vsi_rint
(
double x
)

View File

@ -65,7 +65,7 @@ extern "C" {
#define VSI_NN_DO_JOIN(X, Y) VSI_NN_DO_JOIN2(X,Y)
#define VSI_NN_DO_JOIN2(X, Y) X##Y
#if defined(_MSC_VER)
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define VSI_NN_DEPRECATED(symbol, hints) \
__declspec(deprecated(VSI_NN_STRINGIZE(hints))) symbol
@ -381,7 +381,7 @@ int32_t vsi_nn_partition
* @param[in] num Number of tensors.
* @param[out] out_tensors Ordered tensors
* */
static inline void vsi_nn_reorder_tensor
static VSI_INLINE_API void vsi_nn_reorder_tensor
(
vsi_nn_tensor_t** tensors,
const int32_t* order,
@ -417,6 +417,15 @@ vsi_bool vsi_nn_is_broadcast_operaton
vsi_nn_tensor_t * output
);
vsi_bool vsi_nn_is_broadcast_axes_operaton
(
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t * output,
int32_t * axis,
int32_t axis_num
);
float vsi_nn_get_tensor_scale
(
vsi_nn_tensor_t * tensor

View File

@ -66,6 +66,8 @@ typedef struct _vsi_nn_hw_config_t
uint32_t use_40bits_va;
uint32_t support_stream_processor;
uint32_t sp_exec_count;
uint32_t sp_vector_depth;
uint32_t sp_per_core_vector_depth;
} vsi_nn_hw_config_t;
typedef struct _vsi_nn_runtime_option_t

View File

@ -35,7 +35,7 @@
struct f##_t_{ ~f##_t_(void) { f(); }}; static f##_t_ f##_; \
static void f(void)
#elif defined(_MSC_VER)
#elif (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#pragma section(".CRT$XCU", read)
#define _INITIALIZER2(f, p) \
static void f(void); \

View File

@ -27,7 +27,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_prv.h"
static inline vsi_bool vsi_nn_feature_conv_max_kernel_size()
static VSI_INLINE_API vsi_bool vsi_nn_feature_conv_max_kernel_size()
{
return 11;
}

View File

@ -31,7 +31,7 @@
extern "C"{
#endif
#ifdef _MSC_VER
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define snprintf(buffer, count, format, ...) \
_snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__)
#define vsnprintf(buffer, count, format, args) \

View File

@ -190,6 +190,12 @@
#include "ops/vsi_nn_op_gather_elements.h"
#include "ops/vsi_nn_op_selu.h"
#include "ops/vsi_nn_op_celu.h"
#include "ops/vsi_nn_op_max_pool3d.h"
#include "ops/vsi_nn_op_rcp.h"
#include "ops/vsi_nn_op_sign.h"
#include "ops/vsi_nn_op_softsign.h"
#include "ops/vsi_nn_op_cumsum.h"
#include "ops/vsi_nn_op_mod.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
@ -365,6 +371,12 @@ typedef union _vsi_nn_nn_param
vsi_nn_gather_elements_param gather_elements;
vsi_nn_selu_param selu;
vsi_nn_celu_param celu;
vsi_nn_max_pool3d_param max_pool3d;
vsi_nn_rcp_param rcp;
vsi_nn_sign_param sign;
vsi_nn_softsign_param softsign;
vsi_nn_cumsum_param cumsum;
vsi_nn_mod_param mod;
void* client_param;
/* custom node data struct define */

View File

@ -243,6 +243,18 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
uint32_t enable_nodes_count
);
OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph
(
vsi_nn_graph_t* graph,
uint32_t enabled_crop_input_idx,
uint32_t start_x,
uint32_t start_y,
uint32_t crop_w,
uint32_t crop_h,
uint32_t dst_w,
uint32_t dst_h
);
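
A hedged usage sketch based on the parameter names alone (whether the driver also rescales the cropped window to dst_w x dst_h is an assumption here):

/* Crop a 1920x1080 window at (0, 0) from binary-graph input 0 and deliver it as 1280x720. */
vsi_nn_UpdateCropParamsForBinaryGraph(graph, 0, 0, 0, 1920, 1080, 1280, 720);
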
#ifdef __cplusplus
}
#endif

View File

@ -26,7 +26,7 @@
#define _VSI_NN_PUB_H
#if !defined(OVXLIB_API)
#if defined(_WIN32)
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define OVXLIB_API __declspec(dllimport)
#else
#define OVXLIB_API __attribute__((visibility("default")))

View File

@ -33,11 +33,13 @@
extern "C"{
#endif
#ifdef _WIN32
#define inline __inline
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define VSI_INLINE_API __inline
#else
#define VSI_INLINE_API inline
#endif
#if (defined(_MSC_VER) || defined(__MINGW32))
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define SIZE_T_SPECIFIER "Iu"
#define SSIZE_T_SPECIFIER "Id"
#ifdef VSI_40BIT_VA_SUPPORT
@ -59,7 +61,7 @@ extern "C"{
#endif
#endif
#if defined(_MSC_VER)
#if (defined(_MSC_VER))
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#else

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 43
#define VSI_NN_VERSION_PATCH 50
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
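
For reference, the packed value for this release works out to:

/* VSI_NN_VERSION == 1 * 10000 + 1 * 100 + 50 == 10150 */
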

View File

@ -188,7 +188,7 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input_dtype == I8)
if (input_dtype == I8 || input_dtype == I16)
{
input_dtype = I32;
}
@ -269,7 +269,6 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, OnError );
}
}
@ -285,4 +284,3 @@ OnError:
__END_DECLS
REGISTER_BACKEND_CL( argmax, _setup )

View File

@ -188,6 +188,11 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input_dtype == I8 || input_dtype == I16)
{
input_dtype = I32;
}
if (output_dtype == I16)
{
output_dtype = I32;
@ -264,7 +269,6 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, OnError );
}
}

View File

@ -0,0 +1,365 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
KERNEL_SOURCE_1 },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
KERNEL_SOURCE_2 },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} cumsum_map[] =
{
HASH_CUMSUM_KERNELS(0, U8, U8)
HASH_CUMSUM_KERNELS(0, F32, F32)
HASH_CUMSUM_KERNELS(1, U8, U8)
HASH_CUMSUM_KERNELS(1, F32, F32)
HASH_CUMSUM_KERNELS(2, U8, U8)
HASH_CUMSUM_KERNELS(2, F32, F32)
HASH_CUMSUM_KERNELS_2D(0, U8, U8)
HASH_CUMSUM_KERNELS_2D(0, F32, F32)
HASH_CUMSUM_KERNELS_2D(1, U8, U8)
HASH_CUMSUM_KERNELS_2D(1, F32, F32)
};
/*
* Kernel params
*/
static vx_param_description_t _cumsum_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_cumsum_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * input_shape = NULL;
int32_t axis = 0;
int32_t width = 0;
int32_t height = 0;
int32_t channel = 0;
int32_t w = 1;
int32_t h = 1;
int32_t c = 1;
uint32_t dim = 1;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
input_shape = attr[0]->shape;
dim = (uint32_t)input_shape->size;
width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]);
channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);
if (axis == 0)
{
w = 1;
h = height;
c = channel;
}
else if (axis == 1)
{
w = width;
h = 1;
c = channel;
}
else if (axis == 2)
{
w = width;
h = height;
c = 1;
}
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = w;
gpu_param.global_size[1] = h;
gpu_param.global_size[2] = c;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _cumsum_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
int32_t is_2d
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == U32)
{
input0_dtype = U8;
}
if (input0_dtype == F16)
{
input0_dtype = F32;
}
if (output_dtype == U32)
{
output_dtype = U8;
}
if (output_dtype == F16)
{
output_dtype = F32;
}
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
if ( cumsum_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(cumsum_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name );
kernel->info.parameters = _cumsum_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def );
kernel->info.initialize = _cumsum_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
cumsum_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
cumsum_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t axis_new = 0;
int32_t is_2d = 0;
uint32_t rs_dim = 2;
int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float in_out_scale = input_scale * output_scale;
float in_out_zp_scale = in_out_scale * input_zp;
int32_t width = 0;
int32_t height = 0;
int32_t channel = 1;
int32_t i = 0;
vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rs_dim, &axis_new);
if (rs_dim > 3)
{
return NULL;
}
width = (int32_t)shapes[0][0];
height = (int32_t)shapes[0][1];
if (rs_dim == 2)
{
is_2d = 1;
}
else
{
channel = (int32_t)shapes[0][2];
}
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], (vsi_size_t)rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUMSUM_PARAM_NUM,
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_zp_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUMSUM_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( cumsum, _setup )

View File

@ -53,6 +53,9 @@ typedef enum
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
UNARY_RCP,
UNARY_SIGN,
UNARY_SOFTSIGN,
} unary_type_e;
/*
@ -94,6 +97,13 @@ typedef enum
#define HGELU_OPERATION hard_gelu
#define SELU_OPERATION selu
#define CELU_OPERATION celu
#define RCP_OPERATION rcp
#define SIGN_OPERATION sign
#define SOFTSIGN_OPERATION softsign
#define ADD_UNARY_SH_KERNELS(name, src_type, dst_type) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, src_type, dst_type) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, src_type, dst_type)
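/* Registers both the 3D and the 2D shader variant of a unary op in one line,
 * replacing the per-variant TENSOR_UNARY_KERNELS_3D/2D pairs used previously. */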
static const struct {
uint32_t key;
@ -101,61 +111,39 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F32, F32)
ADD_UNARY_SH_KERNELS(SIN, F32, F32)
ADD_UNARY_SH_KERNELS(COS, F32, F32)
ADD_UNARY_SH_KERNELS(EXP, F32, F32)
ADD_UNARY_SH_KERNELS(LOG, F32, F32)
ADD_UNARY_SH_KERNELS(NEG, F32, F32)
ADD_UNARY_SH_KERNELS(HSIGMOID, F32, F32)
ADD_UNARY_SH_KERNELS(MISH, F32, F32)
ADD_UNARY_SH_KERNELS(ROUND, F32, F32)
ADD_UNARY_SH_KERNELS(GELU, F32, F32)
ADD_UNARY_SH_KERNELS(HGELU, F32, F32)
ADD_UNARY_SH_KERNELS(SELU, F32, F32)
ADD_UNARY_SH_KERNELS(CELU, F32, F32)
ADD_UNARY_SH_KERNELS(RCP, F32, F32)
ADD_UNARY_SH_KERNELS(SIGN, F32, F32)
ADD_UNARY_SH_KERNELS(SOFTSIGN, F32, F32)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F32, F32)
ADD_UNARY_SH_KERNELS(SIN, U8, U8)
ADD_UNARY_SH_KERNELS(COS, U8, U8)
ADD_UNARY_SH_KERNELS(EXP, U8, U8)
ADD_UNARY_SH_KERNELS(LOG, U8, U8)
ADD_UNARY_SH_KERNELS(NEG, U8, U8)
ADD_UNARY_SH_KERNELS(HSIGMOID, U8, U8)
ADD_UNARY_SH_KERNELS(MISH, U8, U8)
ADD_UNARY_SH_KERNELS(ROUND, U8, U8)
ADD_UNARY_SH_KERNELS(GELU, U8, U8)
ADD_UNARY_SH_KERNELS(HGELU, U8, U8)
ADD_UNARY_SH_KERNELS(SELU, U8, U8)
ADD_UNARY_SH_KERNELS(CELU, U8, U8)
ADD_UNARY_SH_KERNELS(RCP, U8, U8)
ADD_UNARY_SH_KERNELS(SIGN, U8, U8)
ADD_UNARY_SH_KERNELS(SOFTSIGN, U8, U8)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
ADD_UNARY_SH_KERNELS(NEG, I32, I32)
};
#undef SIN_OPERATION
@ -170,6 +158,9 @@ static const struct {
#undef HGELU_OPERATION
#undef SELU_OPERATION
#undef CELU_OPERATION
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
/*
* Kernel params
*/
@ -458,4 +449,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( softsign, UNARY_SOFTSIGN )
__END_DECLS

View File

@ -123,7 +123,7 @@ static vsi_status cal_gather_tensor_reshape_size
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
for (i = 0; i < dims_num - batch_dims; ++i)
{
@ -365,4 +365,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( gather, _setup )

View File

@ -111,7 +111,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
@ -336,4 +336,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( gather_nd, _setup )

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -44,21 +43,20 @@ __BEGIN_DECLS
*/
typedef enum
{
INTERNAL_KERNEL_MEAN_VARI,
INTERNAL_KERNEL_SUMS,
INTERNAL_KERNEL_NORM,
} _internal_kernel_e;
#define KERNEL_SOURCE_1 "instance_normalization_u8"
#define KERNEL_SOURCE_2 "instance_normalization_f16"
#define KERNEL_SOURCE_2 "instance_normalization_f32"
#define KERNEL_SOURCE_3 "instance_normalization_i32"
#define KERNEL_SOURCE_4 "instance_normalization_f32"
// Add kernel hashtable here
#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE)
#define HASH_INSTANCENORM_SUMS_KERNEL_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE)
#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE"_2D")
#define HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE"_2D")
#define HASH_INSTANCENORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_"#SRC0_TYPE"to"#DST_TYPE)
@ -68,17 +66,17 @@ typedef enum
// Add kernel hashtable here
// mean vari
#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \
#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(IN0_TYPE), \
#define TENSOR_INSTANCENORM_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_INSTANCENORM_SUMS_KERNEL_NAME(IN0_TYPE), \
SOURCE },
#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(IN0_TYPE), \
#define TENSOR_INSTANCENORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(IN0_TYPE), \
SOURCE },
// normalization
@ -102,17 +100,15 @@ typedef struct
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] =
static const _kernel_map_type _instancenorm_sums_kernel_map[] =
{
// Register kernel here
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F32, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_SUMS_KERNELS( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_SUMS_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_SUMS_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
};
static const _kernel_map_type _instancenorm_kernel_map[] =
@ -123,22 +119,19 @@ static const _kernel_map_type _instancenorm_kernel_map[] =
TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS( I32, I32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 )
};
/*
* Kernel params
*/
static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] =
static vx_param_description_t _instancenorm_sums_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -146,12 +139,9 @@ static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def )
#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def )
static vx_param_description_t _instancenorm_kernel_param_def[] =
{
@ -168,10 +158,6 @@ static vx_param_description_t _instancenorm_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def )
@ -179,7 +165,7 @@ static vx_param_description_t _instancenorm_kernel_param_def[] =
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@ -244,7 +230,7 @@ final:
attr[1] = NULL;
}
return status;
} /* _instance_normalization_mean_vari_initializer() */
} /* _instance_normalization_sums_initializer() */
DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
(
@ -334,12 +320,12 @@ static vsi_status _query_kernel
switch( kernel_id )
{
case INTERNAL_KERNEL_MEAN_VARI:
initializer = _instancenorm_mean_vari_initializer;
kernel_map = _instancenorm_mean_vari_kernel_map;
kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map );
param_def = _instancenorm_mean_vari_kernel_param_def;
param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM;
case INTERNAL_KERNEL_SUMS:
initializer = _instancenorm_sums_initializer;
kernel_map = _instancenorm_sums_kernel_map;
kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map );
param_def = _instancenorm_sums_kernel_param_def;
param_size = _INSTANCENORM_SUMS_PARAM_NUM;
break;
case INTERNAL_KERNEL_NORM:
initializer = _instancenorm_initializer;
@ -392,9 +378,9 @@ static vsi_nn_kernel_node_t _setup
)
{
#define INTERNAL_KERNEL_SIZE (1)
#define MEAN_VARI_INDEX (0)
#define SUMS_INDEX (0)
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_dtype_e in0_dtype = U8;
@ -407,18 +393,17 @@ static vsi_nn_kernel_node_t _setup
uint32_t hashkey = 0;
int32_t i = 0;
uint32_t rank = outputs[0]->attr.dim_num;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) /
(input_scale * input_scale);
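/* eps above is pre-divided by input_scale^2 so that it matches a variance computed
 * from raw quantized sums (assumption: the sums kernel accumulates unscaled values). */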
size_t width = inputs[0]->attr.size[0];
size_t height = inputs[0]->attr.size[1];
int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH
&& rank > 2;
int32_t group_num = (int32_t)(width + 15) / 16;
int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float in_fl_scale = 1.0f, out_fl_scale = 1.0;
float dim_ratio = (float)1.0 / (float)(width * height);
float inv_multiplier = (float)1.0 / (float)(width * height);
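/* Reciprocal of the per-instance element count (width * height); presumably used by
 * the norm kernel to turn the accumulated sums into mean and variance. */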
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
@ -443,15 +428,21 @@ static vsi_nn_kernel_node_t _setup
attr.size[2] = 1;
attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
attr.dim_num = 4;
tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );
tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr );
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
in0_dtype = in0_dtype == I8 ? I32 : in0_dtype;
in0_dtype = in0_dtype == I16 ? I32 : in0_dtype;
out_dtype = out_dtype == F16 ? F32 : out_dtype;
out_dtype = out_dtype == I8 ? I32 : out_dtype;
out_dtype = out_dtype == I16 ? I32 : out_dtype;
hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg );
hashkeys[SUMS_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg );
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
status = _query_kernel( ikernels[SUMS_INDEX], hashkeys[SUMS_INDEX], INTERNAL_KERNEL_SUMS );
if ( VSI_SUCCESS != status )
{
goto final;
@ -497,37 +488,31 @@ static vsi_nn_kernel_node_t _setup
}
// Sums
{
node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] );
if (node)
{
uint32_t index = 0;
if (reshape_flg)
{
mean_vari_node_params[index++] = rs_input;
sums_node_params[index++] = rs_input;
}
else
{
mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
sums_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
sums_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t;
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
status = vsi_nn_kernel_node_pass_param( node, mean_vari_node_params,
_INSTANCENORM_MEAN_VARI_PARAM_NUM );
status = vsi_nn_kernel_node_pass_param( node, sums_node_params,
_INSTANCENORM_SUMS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[5] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[6] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[7] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[8] );
vsi_nn_kernel_scalar_release( &sums_node_params[2] );
vsi_nn_kernel_scalar_release( &sums_node_params[3] );
vsi_nn_kernel_scalar_release( &sums_node_params[4] );
vsi_nn_kernel_scalar_release( &sums_node_params[5] );
vsi_nn_kernel_node_release( &node );
}
}
@ -562,7 +547,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t;
if (reshape_flg)
{
node_params[index++] = rs_output;
@ -573,15 +558,11 @@ static vsi_nn_kernel_node_t _setup
}
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &out_fl_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_num );
status = vsi_nn_kernel_node_pass_param( node, node_params,
@ -595,10 +576,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
vsi_nn_kernel_scalar_release( &node_params[16] );
}
}

View File

@ -0,0 +1,312 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "maxpoolwithargmax"
#define KERNEL_SOURCE_2 "maxpoolwithargmax_2d"
// Add kernel hashtable here
#define MAXPOOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, _image_2d) \
(( IN_DTYPE << 24 ) | ( OUT_DTYPE0 << 20) | ( OUT_DTYPE1 << 12) | (_image_2d))
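/* The lookup key packs the input dtype, both output dtypes and the 2D flag into a
 * single 32-bit value; it only needs to be unique within this table. */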
#define HASH_MAXPOOLWITHARGMAX_KERNELS( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \
{ MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 0), \
CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1), \
KERNEL_SOURCE_1 },
#define HASH_MAXPOOLWITHARGMAX_KERNELS_2D( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \
{ MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 1), \
CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1"_2D"), \
KERNEL_SOURCE_2 },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} maxpoolwithargmax_map[] =
{
HASH_MAXPOOLWITHARGMAX_KERNELS(F32, F32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS(BF16, BF16, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS(U32, U32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS(I32, I32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(F32, F32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(BF16, BF16, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(U32, U32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(I32, I32, I32)
};
/*
* Kernel params
*/
static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vx_status status = VX_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t * attr_out = NULL;
vsi_size_array_t * out_shape = NULL;
attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = attr_out->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = out_shape->data[1];
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
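/* One work-item per output element; the x dimension is rounded up to a multiple of
 * 4 work-items by gpu_align_p2(). */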
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr_out)
{
vsi_nn_kernel_tensor_attr_release(&attr_out);
}
return status;
} /* _maxpoolwithargmax_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t is_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input_dtype = U8;
vsi_nn_kernel_dtype_e output0_dtype = U8;
vsi_nn_kernel_dtype_e output1_dtype = I32;
uint32_t key = 0;
int32_t i = 0;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
output1_dtype = vsi_nn_kernel_map_dtype( outputs[1]->attr.dtype.vx_type );
if (input_dtype == U8)
{
input_dtype = U32;
}
if (input_dtype == I8 || input_dtype == I16)
{
input_dtype = I32;
}
if (input_dtype == F16)
{
input_dtype = F32;
}
if (output0_dtype == U8)
{
output0_dtype = U32;
}
if (output0_dtype == I8 || output0_dtype == I16)
{
output0_dtype = I32;
}
if (output0_dtype == F16)
{
output0_dtype = F32;
}
key = MAXPOOLWITHARGMAX_HASH_KEY( input_dtype, output0_dtype, output1_dtype, is_2d);
for ( i = 0; i < _cnt_of_array(maxpoolwithargmax_map); i ++ )
{
if ( maxpoolwithargmax_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(maxpoolwithargmax_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", maxpoolwithargmax_map[i].function_name );
kernel->info.parameters = _maxpoolwithargmax_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _maxpoolwithargmax_kernel_param_def );
kernel->info.initialize = _maxpoolwithargmax_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
maxpoolwithargmax_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
maxpoolwithargmax_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
int32_t pad_x = vsi_nn_kernel_param_get_int32(params, "pad_left");
int32_t pad_y = vsi_nn_kernel_param_get_int32(params, "pad_top");
int32_t image_2d = inputs[0]->attr.dim_num == 2 ? 1 : 0;
int32_t width = (int32_t)inputs[0]->attr.size[0];
int32_t height = (int32_t)inputs[0]->attr.size[1];
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float scale_value = 1.0f;
float tail_value = 0.0f;
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[1]->attr.size,
outputs[1]->attr.dim_num ))
{
return NULL;
}
scale_value = inputScale / outputScale;
tail_value = outputTail - inputTail * inputScale / outputScale;
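/* Requantization of the pooled values: q_out = q_in * scale_value + tail_value,
 * which follows from real = (q_in - in_zp) * in_scale and
 * q_out = real / out_scale + out_zp. */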
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 3;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_value );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &tail_value );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( maxpoolwithargmax, _setup )

View File

@ -0,0 +1,303 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define MOD_KERNEL_SOURCE_NAME "mod"
#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
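/* Key layout: input0 dtype, input1 dtype, output dtype and the 2D flag, one byte each. */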
#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \
MOD_KERNEL_SOURCE_NAME},
#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \
MOD_KERNEL_SOURCE_NAME },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _mod_kernel_map[] =
{
// Register kernel here
MOD_KERNELS( F32, F32, F32 )
MOD_KERNELS( I32, I32, I32 )
MOD_KERNELS( I32, I32, U8 )
MOD_KERNELS( U8, U8, U8 )
MOD_KERNELS( U8, I32, U8 )
MOD_KERNELS_2D( F32, F32, F32 )
MOD_KERNELS_2D( I32, I32, I32 )
MOD_KERNELS_2D( I32, I32, U8 )
MOD_KERNELS_2D( U8, U8, U8 )
MOD_KERNELS_2D( U8, I32, U8 )
};
/*
* Kernel params
*/
static vx_param_description_t _mod_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
#define MOD_QUANT_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_mod_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
gpu_param.dim = output_shape->size < 3 ? 2 : 3;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _mod_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _mod_kernel_map;
size_t kernel_map_size = _cnt_of_array( _mod_kernel_map );
vx_param_description_t * param_def = _mod_kernel_param_def;
size_t param_def_size = _cnt_of_array( _mod_kernel_param_def );
vx_kernel_initialize_f initializer = _mod_initializer;
uint32_t key = 0;
uint32_t i = 0;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in0_dtype)
{
in0_dtype = F32;
}
else if (I16 == in0_dtype || I8 == in0_dtype)
{
in0_dtype = I32;
}
if (F16 == in1_dtype)
{
in1_dtype = F32;
}
else if (I16 == in1_dtype || I8 == in1_dtype)
{
in1_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (I16 == out_dtype || I8 == out_dtype)
{
out_dtype = I32;
}
key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");
outputScale = 1.0f / outputScale;
input0Tail = -(input0Tail * input0Scale);
input1Tail = -(input1Tail * input1Scale);
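/* Zero points are folded into additive tails so the shader can dequantize with a
 * single multiply-add: real = q * scale + tail, where tail = -zp * scale; the result
 * is requantized with the inverted outputScale plus outputTail. */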
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( kernel, inputs, outputs, image_2d);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = MOD_QUANT_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail );
node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail );
node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[9] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( mod, _setup )

View File

@ -48,7 +48,7 @@ __BEGIN_DECLS
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \
{ ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \
CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) }
typedef struct
@ -61,6 +61,7 @@ typedef struct
static const _kernel_map_type _roi_align_kernel_map[] =
{
PACK_KERNEL_MAP(F32, F32, I32, F32),
PACK_KERNEL_MAP(U8, U16, I32, U8),
};
@ -82,20 +83,28 @@ static vx_param_description_t _roi_align_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
#define SCALAR_SPATIAL_X_SCALE (4)
#define SCALAR_SPATIAL_Y_SCALE (5)
#define SCALAR_INPUT_WIDTH (6)
#define SCALAR_INPUT_HEIGHT (7)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (8)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9)
#define SCALAR_SAMPLING_X_RATIO (10)
#define SCALAR_SAMPLING_Y_RATIO (11)
#define SCALAR_DEPTH (12)
#define SCALAR_INPUT_SCALE (4)
#define SCALAR_INPUT_TAIL (5)
#define SCALAR_OUTPUT_SCALE (6)
#define SCALAR_OUTPUT_ZP (7)
#define SCALAR_SPATIAL_X_SCALE (8)
#define SCALAR_SPATIAL_Y_SCALE (9)
#define SCALAR_INPUT_WIDTH (10)
#define SCALAR_INPUT_HEIGHT (11)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (12)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (13)
#define SCALAR_SAMPLING_X_RATIO (14)
#define SCALAR_SAMPLING_Y_RATIO (15)
#define SCALAR_DEPTH (16)
#define ROI_ALIGN_PARAM_NUM 13
#define ROI_ALIGN_PARAM_NUM 17
#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
/*
@ -185,6 +194,7 @@ static vsi_status _query_kernel
in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
in1_dtype = in1_dtype == F16 ? F32 : in1_dtype;
out_dtype = out_dtype == F16 ? F32 : out_dtype;
key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d );
@ -241,8 +251,14 @@ static vsi_nn_kernel_node_t _setup
float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
float width_scale = 1.0f / width_ratio;
float height_scale = 1.0f / height_ratio;
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_tail = -(input_zp * input_scale);
float roi_scale = vsi_nn_get_tensor_scale(inputs[1]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float width_scale = roi_scale / width_ratio;
float height_scale = roi_scale / height_ratio;
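/* The ROI coordinates may themselves be quantized (see the U8/U16 kernel added above),
 * so the ROI scale is folded into the spatial scales instead of dequantizing the
 * coordinates separately. */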
float in_width = (float)(inputs[0]->attr.size[0]);
float in_height = (float)(inputs[0]->attr.size[1]);
float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]);
@ -287,6 +303,10 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );
@ -299,6 +319,10 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );

View File

@ -115,7 +115,7 @@ static vsi_status cal_scatter_nd_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
@ -333,4 +333,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( scatter_nd, _setup )

View File

@ -108,7 +108,7 @@ static vsi_status cal_scatter_nd_update_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
@ -373,4 +373,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( scatter_nd_update, _setup )

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -49,6 +48,13 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk_odd_even_sort" }
typedef struct
{
uint32_t key;
@ -84,6 +90,14 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( I32, I32, 6 ),
};
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
{
// Register kernel here
PACK_ODD_EVEN_SORT_KERNEL_MAP( F32, F32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP( U32, U32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP( I32, I32 ),
};
/*
* Kernel params
*/
@ -99,6 +113,19 @@ static vx_param_description_t _topk_kernel_param_def[] =
#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def )
#define SCALAR_INPUT_NUM_STAGES (3)
#define SCALAR_INPUT_WIDTH (4)
static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TOPK_ODD_EVEN_SORT_PARAM_NUM _cnt_of_array( _topk_odd_even_sort_kernel_param_def )
#define SCALAR_INPUT_SIZE (5)
/*
* Kernel initializer
*/
@ -140,9 +167,47 @@ DEF_KERNEL_INITIALIZER(_topk_initializer)
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _topk_initializer() */
DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.local_size[0] = 32;
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] = 32;
gpu_param.global_size[1] = in_shape->data[1];
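/* One 32-wide work-group per row: local and global x size are both 32, and the y
 * dimension iterates over the rows to be sorted. */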
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _topk_odd_even_sort_initializer() */
/*
* Query kernel
@ -215,6 +280,72 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static vsi_status _query_odd_even_sort_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _topk_odd_even_sort_kernel_map;
size_t kernel_map_size = _cnt_of_array( _topk_odd_even_sort_kernel_map );
vx_param_description_t * param_def = _topk_odd_even_sort_kernel_param_def;
vx_kernel_initialize_f initializer = _topk_odd_even_sort_initializer;
#define _PACK_SELECT_KEY( in_type, out_type ) \
( (in_type) | (out_type << 8) )
uint32_t key = 0;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_ODD_EVEN_SORT_HASH_KEY( F32, F32 );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_ODD_EVEN_SORT_HASH_KEY( U32, U32 );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_ODD_EVEN_SORT_HASH_KEY( I32, I32 );
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _topk_odd_even_sort_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_odd_even_sort_kernel() */
static vsi_nn_kernel_node_t _setup
(
@ -228,16 +359,19 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM];
vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_size_t block_size = inputs[0]->attr.size[0];
vsi_size_t block_num = 1;
uint32_t i = 0;
vsi_nn_tensor_t* rs_tensors[3] = { NULL };
vsi_nn_tensor_t* rs_tensors[5] = { NULL };
vsi_nn_tensor_attr_t attr;
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t width = (int32_t)block_size;
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
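/* num_stages = ceil(log2(block_size / 2)); inputs that would need 7 or more sorting
 * stages fall back to the odd-even sort path below. */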
vsi_bool is_odd_even_sort = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
for (i = 1; i < inputs[0]->attr.dim_num; i ++)
{
@ -257,26 +391,58 @@ static vsi_nn_kernel_node_t _setup
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
status = _query_kernel( kernel, inputs, outputs, num_stages );
if (num_stages < 7)
{
status = _query_kernel( kernel, inputs, outputs, num_stages );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
}
else
{
status = _query_odd_even_sort_kernel( kernel, inputs, outputs );
is_odd_even_sort = TRUE;
param_num = _TOPK_ODD_EVEN_SORT_PARAM_NUM;
memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) );
rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr );
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr );
rs_tensors[3] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[4] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
input_num = 3;
}
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM,
rs_tensors, input_num, &rs_tensors[1], output_num );
vsi_nn_kernel_node_pack_io( node_params, param_num,
rs_tensors, input_num, &rs_tensors[input_num], output_num );
/* Pass parameters to node. */
node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
graph, I32, &num_stages );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM );
if (is_odd_even_sort)
{
node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
}
else
{
node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
graph, I32, &num_stages );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
}
status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
@ -284,13 +450,25 @@ final:
vsi_safe_release_tensor(rs_tensors[0]);
vsi_safe_release_tensor(rs_tensors[1]);
vsi_safe_release_tensor(rs_tensors[2]);
if (node_params[SCALAR_INPUT_NUM_STAGES])
vsi_safe_release_tensor(rs_tensors[3]);
vsi_safe_release_tensor(rs_tensors[4]);
if (is_odd_even_sort)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
if (node_params[SCALAR_INPUT_SIZE])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SIZE] );
}
}
if (node_params[SCALAR_INPUT_WIDTH])
else
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
if (node_params[SCALAR_INPUT_NUM_STAGES])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
}
if (node_params[SCALAR_INPUT_WIDTH])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
}
}
return node;

View File

@ -0,0 +1,260 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (3)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cumsum")
DEF_KERNEL_EXECUTOR(_cumsum_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[2] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
int32_t i = 0;
int32_t axisSize = 1, innerSize = 1, outerSize = 1;
int32_t axis = 0, exclusive = 0, reverse = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
{
int32_t dims_num = (int32_t)attr[1]->shape->size;
int32_t inner = 0;
int32_t outer = 0;
for(i = 0; i < axis; ++i)
{
innerSize *= (int32_t)attr[0]->shape->data[i];
}
axisSize = (int32_t)attr[0]->shape->data[i++];
for(; i < dims_num; ++i)
{
outerSize *= (int32_t)attr[0]->shape->data[i];
}
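/* The tensor is viewed as outerSize x axisSize x innerSize blocks, with element
(outer, i, inner) at flat index (outer * axisSize + i) * innerSize + inner.
The branches below implement the cumsum variants: exclusive shifts the running
sum by one position along the axis (the first/last output is 0), and reverse
accumulates from the end of the axis toward the start. */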
for ( outer = 0; outer < outerSize; ++outer)
{
for ( inner = 0; inner < innerSize; ++inner)
{
float sum = .0f;
if (exclusive && reverse)
{
int32_t idx_out = (outer * axisSize + axisSize - 1) * innerSize + inner;
buffer[1][idx_out] = sum;
for (i = axisSize - 1; i > 0; i--)
{
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
idx_out = (outer * axisSize + i - 1) * innerSize + inner;
sum += value;
buffer[1][idx_out] = sum;
}
}
else if (exclusive)
{
int32_t idx_out = outer * axisSize * innerSize + inner;
buffer[1][idx_out] = sum;
for (i = 0; i < axisSize - 1; ++i)
{
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
idx_out = (outer * axisSize + i + 1) * innerSize + inner;
sum += value;
buffer[1][idx_out] = sum;
}
}
else if (reverse)
{
for (i = axisSize - 1; i >= 0; i--)
{
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
sum += value;
buffer[1][idx] = sum;
}
}
else
{
for (i = 0; i < axisSize; ++i)
{
// i * innerSize + inner + outer * innerSize * axisSize
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
sum += value;
buffer[1][idx] = sum;
}
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for ( i = 0; i < 2; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _cumsum_exec() */
/*
* Kernel params
*/
static vx_param_description_t _cumsum_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _cumsum_exec;
kernel->info.parameters = _cumsum_kernel_param_def;
kernel->info.numParams = _CUMSUM_PARAM_NUM;
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[2] );
vsi_nn_kernel_scalar_release( &backend_params[3] );
vsi_nn_kernel_scalar_release( &backend_params[4] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( cumsum, _setup )

View File

@ -50,6 +50,9 @@ typedef enum
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
UNARY_RCP,
UNARY_SIGN,
UNARY_SOFTSIGN,
} unary_type_e;
@ -145,6 +148,21 @@ static float celu_eval(float x, float alpha)
return positive + negative;
}
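/* Reference implementations for the newly added unary ops:
rcp(x) = 1 / x, sign(x) in {-1, 0, 1}, and softsign(x) = x / (1 + |x|). */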
static float rcp_eval(float x)
{
return 1.0f / x;
}
static float sign_eval(float x)
{
return x > 0 ? 1.0f : x < 0 ? -1.0f : 0.0f;
}
static float softsign_eval(float x)
{
return x / (1.0f + vsi_abs(x));
}
DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
(
vsi_nn_kernel_node_t node,
@ -227,6 +245,15 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_CELU:
data = celu_eval(data, alpha);
break;
case UNARY_RCP:
data = rcp_eval(data);
break;
case UNARY_SIGN:
data = sign_eval(data);
break;
case UNARY_SOFTSIGN:
data = softsign_eval(data);
break;
default:
break;
}
@ -360,4 +387,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( softsign, UNARY_SOFTSIGN )

View File

@ -0,0 +1,284 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (8)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (2)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.maxpoolwithargmax")
#define FP32_MIN (-3.4e38f)
/*
* Kernel params
*/
static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
// Add kernel parameters here
};
#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_maxpoolwithargmax_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0;
int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0;
int32_t i = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_x);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &ksize_y);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &stride_x);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &stride_y);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_left);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_right);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_top);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_bottom);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
buffer[2] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
memset( buffer[2], 0, out_elements * sizeof(float) );
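/* buffer[1] receives the pooled max values and buffer[2] the argmax: the flat
index into the input tensor of the element that produced each maximum. Padding
is handled by clamping the pooling window to the valid input region. */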
{
int32_t dims_num = (int32_t)attr[1]->shape->size;
int32_t batch = dims_num > 3 ? (int32_t)attr[1]->shape->data[3] : 1;
int32_t depth = dims_num > 2 ? (int32_t)attr[1]->shape->data[2] : 1;
int32_t height_o = (int32_t)attr[1]->shape->data[1];
int32_t width_o = (int32_t)attr[1]->shape->data[0];
int32_t width = (int32_t)attr[0]->shape->data[0];
int32_t height = (int32_t)attr[0]->shape->data[1];
int32_t b = 0, d = 0, j = 0;
int32_t output_base = 0;
int32_t input_base = 0;
for (b = 0; b < batch; b++)
{
for (d = 0; d < depth; d++)
{
output_base = b * depth * height_o * width_o + d * height_o * width_o;
input_base = b * depth * height * width + d * height * width;
for (j = 0; j < height_o; j++)
{
for (i = 0; i < width_o; i++)
{
int32_t hstart = j * stride_y - pad_top;
int32_t wstart = i * stride_x - pad_left;
int32_t hend = vsi_nn_min(hstart + ksize_y, height);
int32_t wend = vsi_nn_min(wstart + ksize_x, width);
int32_t pool_index = output_base + j * width_o + i;
int32_t h = 0, w = 0;
int32_t index_max = 0;
float value_max = (float)FP32_MIN;
hstart = vsi_nn_max(hstart, 0);
wstart = vsi_nn_max(wstart, 0);
for (h = hstart; h < hend; ++ h)
{
for (w = wstart; w < wend; ++ w)
{
int32_t index = input_base + h * width + w;
float data = buffer[0][index];
if (data > value_max)
{
value_max = data;
index_max = index;
}
}
}
buffer[1][pool_index] = value_max;
buffer[2][pool_index] = (float)index_max;
}
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
buffer[2], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _maxpoolwithargmax_exec() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _maxpoolwithargmax_exec;
kernel->info.parameters = _maxpoolwithargmax_kernel_param_def;
kernel->info.numParams = _MAXPOOLWITHARGMAX_PARAM_NUM;
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left");
int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right");
int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top");
int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom");
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
int32_t index = 3;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( maxpoolwithargmax, _setup )

View File

@ -0,0 +1,247 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.mod")
/*
* Kernel params
*/
static vx_param_description_t _mod_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
static vsi_ssize_t _expand_offset
(
vsi_ssize_t index,
vsi_size_t * shape, vsi_size_t rank,
vsi_size_t * strides, vsi_size_t * out_shape
)
{
vsi_size_t i;
vsi_ssize_t offset = 0;
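/* Broadcasting: a dimension only contributes to the offset when its size
matches the output shape; size-1 (broadcast) dimensions add nothing, so the
same input element is reused across that axis. */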
for( i = 0; i < rank && index; i ++ )
{
if( shape[i] == out_shape[i] )
{
offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] );
}
index /= out_shape[i];
}
return offset;
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
int32_t isfmod = 0;
vsi_nn_kernel_dtype_e input0_dtype = F16;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float* f32_in_buffer[_INPUT_NUM] = {NULL};
float* f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t* in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t* out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i;
/* prepare data */
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &isfmod);
for (i = 0; i < _INPUT_NUM; i++) {
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create(input[i]);
vsi_nn_kernel_tensor_attr_get_stride(in_attr[i], in_stride_size[i]);
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer(input[i], in_attr[i], TRUE);
CHECK_PTR_FAIL_GOTO(f32_in_buffer[i], "Create input0 buffer fail.", final);
}
input0_dtype = in_attr[0]->dtype;
if (input0_dtype == F16 || input0_dtype == F32 || input0_dtype == BF16) {
isfmod = 1;
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create(output[i]);
vsi_nn_kernel_tensor_attr_get_stride(out_attr[i], out_stride_size[i]);
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size(out_attr[i]);
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float*)malloc(out_bytes[i]);
CHECK_PTR_FAIL_GOTO(f32_out_buffer[i], "Create output buffer fail.", final);
memset(f32_out_buffer[i], 0, out_bytes[i]);
}
for (i = 0; i < out_elements[0]; i++)
{
vsi_ssize_t in0_offset = 0;
vsi_ssize_t in1_offset = 0;
float in0 = 0;
float in1 = 0;
in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size,
in_stride_size[0], out_attr[0]->shape->data );
in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size,
in_stride_size[1], out_attr[0]->shape->data );
in0 = f32_in_buffer[0][in0_offset];
in1 = f32_in_buffer[1][in1_offset];
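/* fmod keeps the sign of the dividend (C-style truncated remainder); the else
branch computes a floored modulo, whose result follows the sign of the divisor,
matching the usual integer Mod semantics. */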
if (isfmod)
{
f32_out_buffer[0][i] = (float)fmod(in0,in1);
}
else
{
f32_out_buffer[0][i] = in0 - in1 * (float)floor(in0 / in1);
}
}
/* save data */
for (i = 0; i < _OUTPUT_NUM; i++) {
status = vsi_nn_kernel_tensor_write_from_float(
output[i], out_attr[i], f32_out_buffer[i], out_elements[i]);
CHECK_STATUS_FAIL_GOTO(status, final);
}
final:
for (i = 0; i < _INPUT_NUM; i++) {
if (f32_in_buffer[i]) {
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i]) {
vsi_nn_kernel_tensor_attr_release(&in_attr[i]);
}
}
for (i = 0; i < _OUTPUT_NUM; i++) {
if (f32_out_buffer[i]) {
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i]) {
vsi_nn_kernel_tensor_attr_release(&out_attr[i]);
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _mod_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _mod_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( mod, _setup )

View File

@ -55,8 +55,8 @@ __BEGIN_DECLS
static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -90,12 +90,16 @@ DEF_KERNEL_EXECUTOR(_compute)
uint32_t i = 0;
int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
float mean[3] = {0}, scale = 1;
vsi_bool is_rgb888 = tensors[1] == NULL;
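/* Intent: when the optional G and B plane inputs are absent, the input is a
packed rgb888_planar tensor, i.e. all three channels live in the first input
and are addressed further below with a per-channel offset of
src_width * src_height. */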
for (i = 0; i < _CPU_IO_NUM; i++)
{
tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
if (tensors[i])
{
attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
}
}
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
@ -113,8 +117,11 @@ DEF_KERNEL_EXECUTOR(_compute)
for (i = 0; i < 3; i++)
{
buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
if (tensors[i])
{
buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
}
buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final );
@ -125,12 +132,17 @@ DEF_KERNEL_EXECUTOR(_compute)
int32_t line1[2], line2[2];
int32_t dx = 0, dy = 0, idx = 0;
int32_t src_width = (int32_t)attr[0]->shape->data[0];
int32_t src_height = (int32_t)attr[0]->shape->data[1];
int32_t dst_width = (int32_t)attr[3]->shape->data[0];
int32_t dst_height = (int32_t)attr[3]->shape->data[1];
uint8_t result = 0;
int32_t offset = 0;
int32_t index = 0;
for ( idx = 0; idx < 3; idx ++)
{
offset = is_rgb888 ? idx * src_width * src_height : 0;
index = is_rgb888 ? 0 : idx;
for ( dy = 0; dy < (int32_t)dst_height; dy ++)
{
for ( dx = 0; dx < (int32_t)dst_width; dx ++)
@ -170,10 +182,10 @@ DEF_KERNEL_EXECUTOR(_compute)
sy += yOffset;
source_index = (sx + sy * src_width);
line1[0] = (int32_t)buffer[idx][source_index];
line1[1] = (int32_t)buffer[idx][source_index + 1];
line2[0] = (int32_t)buffer[idx][source_index + src_width];
line2[1] = (int32_t)buffer[idx][source_index + src_width + 1];
line1[0] = (int32_t)buffer[index][source_index + offset];
line1[1] = (int32_t)buffer[index][source_index + 1 + offset];
line2[0] = (int32_t)buffer[index][source_index + src_width + offset];
line2[1] = (int32_t)buffer[index][source_index + src_width + 1 + offset];
temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10);
temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10);
@ -184,10 +196,10 @@ DEF_KERNEL_EXECUTOR(_compute)
}
else
{
int32_t offset = xOffset + yOffset * src_width;
source_index = dx + dy * src_width + offset;
finalVal = (buffer[0][source_index] - mean[idx]) * scale;
buffer[1][output_index] = finalVal;
int32_t ofset = xOffset + yOffset * src_width;
source_index = dx + dy * src_width + ofset + offset;
finalVal = (buffer[index][source_index] - mean[idx]) * scale;
buffer[idx + 3][output_index] = finalVal;
}
}
}

View File

@ -209,16 +209,15 @@ DEF_KERNEL_EXECUTOR(_compute)
for (n = 0; n < num_rois; n++)
{
uint32_t batchId = (uint32_t)f32_in_buffer[2][n];
float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f;
float qx1 = f32_in_buffer[1][n * kRoiDim];
float qy1 = f32_in_buffer[1][n * kRoiDim + 1];
float qx2 = f32_in_buffer[1][n * kRoiDim + 2];
float qy2 = f32_in_buffer[1][n * kRoiDim + 3];
float x1 = qx1 * scale;
float x2 = qx2 * scale;
float y1 = qy1 * scale;
float y2 = qy2 * scale;
float x1 = qx1;
float x2 = qx2;
float y1 = qy1;
float y2 = qy2;
float roi_anchor_x = x1 * width_scale;
float roi_anchor_y = y1 * height_scale;
float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f);

View File

@ -0,0 +1,770 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"
#define KERNEL_SOURCE_3 "cumsum_bf16"
#define KERNEL_SOURCE_4 "cumsum_f16_u8"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
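/* The lookup key packs the cumsum axis, input dtype, output dtype and the 2D
flag into disjoint bit fields, so each (axis, dtypes, layout) combination maps
to exactly one shader entry in cumsum_map[]. */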
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} cumsum_map[] =
{
HASH_CUMSUM_KERNELS(0, U8, U8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, I8, I8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, I16, I16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, F16, F16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS(1, U8, U8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, I8, I8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, I16, I16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, F16, F16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS(2, U8, U8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, I8, I8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, I16, I16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, F16, F16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS(0, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(0, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(0, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(1, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(1, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(1, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(2, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(2, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(2, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(0, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4)
};
/*
* Kernel params
*/
static vx_param_description_t _cumsum_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_cumsum_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
int32_t axis = 0;
int32_t width = 0;
int32_t height = 0;
int32_t channel = 0;
int32_t w = 1;
int32_t h = 1;
int32_t c = 1;
uint32_t dim = 1;
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * input_shape = NULL;
int32_t input_zp = 0;
float input_scale = 1.0f;
float output_zp = 0;
float output_scale = 1.0f;
float in_out_zp_scale = 1.0f;
float in_out_scale = 1.0f;
uint32_t pack_key = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, OnError );
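/* Dynamic fixed point: real value = stored value * 2^-fl, so a positive fl
scales the stored integer down and a negative fl scales it up. Asymmetric
quantization uses the usual (value - zero_point) * scale mapping instead. */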
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
input_zp = attr[0]->asymm.zero_point;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
output_scale = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / attr[1]->asymm.scale;
output_zp = (float)attr[1]->asymm.zero_point;
}
in_out_scale = input_scale * output_scale;
in_out_zp_scale = (float)in_out_scale * input_zp;
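/* in_out_scale folds the input dequantize and output quantize steps into one
multiplier, and in_out_zp_scale pre-scales the input zero point; the shader
presumably uses them to requantize the running sum without extra divisions. */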
input_shape = attr[0]->shape;
dim = (uint32_t)input_shape->size;
width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]);
channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);
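/* The work size along the accumulation axis is collapsed to 1: each thread
walks the whole axis itself, so only the remaining two dimensions are spread
across the global work size. */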
if (axis == 0)
{
w = 1;
h = height;
c = channel;
}
else if (axis == 1)
{
w = width;
h = 1;
c = channel;
}
else if (axis == 2)
{
w = width;
h = height;
c = 1;
}
shaderParam.global_scale[0] = 8;
if ((attr[0]->dtype == U8 || attr[0]->dtype == I8)
&& (axis > 0))
{
shaderParam.global_scale[0] = 16;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0];
shaderParam.global_size[1] = h;
shaderParam.global_size[2] = c;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \
(IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis, dim);
{
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP0[2] = {0};
gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertF16toF16_2x8 = {{
0x55555555, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32A_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00110000, 0x00330022, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32B_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00150004, 0x00370026, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32C_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00190008, 0x003b002a, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32D_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x001d000c, 0x003f002e, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzF16toF16A_4x4 = {{
0x55150501, // TCfg
0x00000000, // ASelt
0x00100000, 0x32100210, // ABin
0xaa2a0a02, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x3c003c00, 0x00000000,
0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzF16toF16B_4x4 = {{
0x55150501, // TCfg
0x00000000, // ASelt
0x00540004, 0x76540654, // ABin
0xaa2a0a02, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x3c003c00, 0x00000000,
0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzF16toF16C_2x8 = {{
0x55551111, // TCfg
0x00000000, // ASelt
0x03020100, 0x37363534, // ABin
0xaaaa2222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumHorzF16toF16_2x8 = {{
0x55555555, // TCfg
0x44444444, // ASelt
0x73727170, 0x77767574, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00,
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzU8toI16A_4x4 = {{
0x55150501, // TCfg
0x00000000, // ASelt
0x00100000, 0x32100210, // ABin
0xaa2a0a02, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzU8toI16B_8x4 = {{
0x05550155, 0x55551555, // TCfg
0x00418820, 0x41882000, 0x8820000a, 0x20018a41, 0x398a4188, // BinSelect
0x00000700, // AccumType, ConstantType, and PostShift
0x01010101, 0x00000001, 0x01010101, 0x00000101,
0x01010101, 0x00010101, 0x01010101, 0x01010101 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSubZpI16toI16_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00020001, 0x00030001, 0x00040001,
0x00050001, 0x00060001, 0x00070001, 0x00080001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumHorzI16toI32A_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00310030, 0x00330032, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumHorzI16toI32B_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00350034, 0x00370036, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniSetZeroF16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, U8, 2, 3):
case _PACK_SELECT_KEY( I8, I8, 2, 3):
case _PACK_SELECT_KEY( I16, I16, 2, 3):
case _PACK_SELECT_KEY( F16, F16, 2, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, U8, 0, 2):
case _PACK_SELECT_KEY( U8, U8, 1, 2):
case _PACK_SELECT_KEY( U8, U8, 0, 3):
case _PACK_SELECT_KEY( U8, U8, 1, 3):
case _PACK_SELECT_KEY( I8, I8, 0, 2):
case _PACK_SELECT_KEY( I8, I8, 1, 2):
case _PACK_SELECT_KEY( I8, I8, 0, 3):
case _PACK_SELECT_KEY( I8, I8, 1, 3):
case _PACK_SELECT_KEY( I16, I16, 0, 2):
case _PACK_SELECT_KEY( I16, I16, 1, 2):
case _PACK_SELECT_KEY( I16, I16, 0, 3):
case _PACK_SELECT_KEY( I16, I16, 1, 3):
case _PACK_SELECT_KEY( F16, F16, 0, 2):
case _PACK_SELECT_KEY( F16, F16, 1, 2):
case _PACK_SELECT_KEY( F16, F16, 0, 3):
case _PACK_SELECT_KEY( F16, F16, 1, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 0, 2):
case _PACK_SELECT_KEY( BF16, BF16, 1, 2):
case _PACK_SELECT_KEY( BF16, BF16, 0, 3):
case _PACK_SELECT_KEY( BF16, BF16, 1, 3):
case _PACK_SELECT_KEY( BF16, BF16, 2, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, U8, 0, 2):
case _PACK_SELECT_KEY( F16, U8, 1, 2):
case _PACK_SELECT_KEY( F16, U8, 0, 3):
case _PACK_SELECT_KEY( F16, U8, 1, 3):
case _PACK_SELECT_KEY( F16, U8, 2, 3):
case _PACK_SELECT_KEY( F16, I8, 0, 2):
case _PACK_SELECT_KEY( F16, I8, 1, 2):
case _PACK_SELECT_KEY( F16, I8, 0, 3):
case _PACK_SELECT_KEY( F16, I8, 1, 3):
case _PACK_SELECT_KEY( F16, I8, 2, 3):
case _PACK_SELECT_KEY( F16, I16, 0, 2):
case _PACK_SELECT_KEY( F16, I16, 1, 2):
case _PACK_SELECT_KEY( F16, I16, 0, 3):
case _PACK_SELECT_KEY( F16, I16, 1, 3):
case _PACK_SELECT_KEY( F16, I16, 2, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "multAndoutZP0", &multAndoutZP0);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
#undef _PACK_SELECT_KEY
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
}
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t axis,
int32_t is_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
for( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
if ( cumsum_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(cumsum_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name );
kernel->info.parameters = _cumsum_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def );
kernel->info.initialize = _cumsum_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
cumsum_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
cumsum_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t axis_new = 0;
int32_t is_2d = 0;
uint32_t rs_dim = 2;
int32_t i = 0;
vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rs_dim, &axis_new);
if (exclusive || reverse || rs_dim > 3)
{
return NULL;
}
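/* exclusive/reverse and collapsed ranks above 3 are not covered by the shader
table, so returning NULL here presumably lets the framework fall back to
another backend (for example the CPU cumsum kernel). */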
if (rs_dim == 2)
{
is_2d = 1;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], (vsi_size_t)rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _CUMSUM_PARAM_NUM,
reshape_tensors, 1, &reshape_tensors[1], 1 );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _CUMSUM_PARAM_NUM );
vsi_nn_kernel_scalar_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( cumsum, _setup )

View File

@ -53,6 +53,9 @@ typedef enum
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
UNARY_RCP,
UNARY_SIGN,
UNARY_SOFTSIGN,
} unary_type_e;
/*
@ -94,6 +97,34 @@ typedef enum
#define HGELU_OPERATION hard_gelu
#define SELU_OPERATION selu
#define CELU_OPERATION celu
#define RCP_OPERATION rcp
#define SIGN_OPERATION sign
#define SOFTSIGN_OPERATION softsign
#define ADD_UNARY_SH_KERNELS(name, source) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, F16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, U8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, U8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, I16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, I16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, F16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, I8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, I8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, F16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, U8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, U8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, F16, source##_2D) \
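/* One ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1) invocation expands to the 22
 * map entries previously written out by hand: 3D and 2D variants for
 * BF16->BF16, F16->{F16,I16,U8,I8}, I16->{I16,F16}, I8->{I8,F16} and
 * U8->{U8,F16}, each referencing KERNEL_SOURCE1_3D or KERNEL_SOURCE1_2D.
 * The entry layout itself comes from the TENSOR_UNARY_KERNELS_3D/_2D macros
 * defined earlier in this file. */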
static const struct {
uint32_t key;
@ -101,269 +132,22 @@ static const struct {
const char* source_name;
} _eltwise_unary_evis_kernel_map[] =
{
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_3D)
ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(COS, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(EXP, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(LOG, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(SELU, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(CELU, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(NEG, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(RCP, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(SIGN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(SOFTSIGN, KERNEL_SOURCE1)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16, KERNEL_SOURCE0_2D)
ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(ROUND, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(GELU, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(HGELU, KERNEL_SOURCE0)
};
#undef SIN_OPERATION
@ -378,6 +162,9 @@ static const struct {
#undef GELU_OPERATION
#undef HGELU_OPERATION
#undef CELU_OPERATION
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
/*
* Kernel params
*/
@ -509,6 +296,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SOFTSIGN, BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@ -815,5 +605,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( softsign, UNARY_SOFTSIGN )
__END_DECLS

View File

@ -222,7 +222,7 @@ static vsi_status get_gather_tensor_reshape_size
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
for(i = 0; i < dims_num - batch_dims; ++i)
{
@ -751,7 +751,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE (65536)
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
@ -795,12 +795,6 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch);
if ( VSI_SUCCESS == status)
{

View File

@ -136,7 +136,7 @@ static vsi_status get_gather_nd_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)

View File

@ -44,7 +44,7 @@ __BEGIN_DECLS
typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
HSIGMOID = VSI_NN_ACT_HARD_SIGMOID,
}grucell_nn_activation_type_e;
#define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h"
@ -72,6 +72,10 @@ static const _kernel_map_type _grucell_activation_z_h_kernel_map[] =
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
};
/*

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -45,7 +44,7 @@ __BEGIN_DECLS
typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
HSIGMOID = VSI_NN_ACT_HARD_SIGMOID,
}grucell_nn_activation_type_e;
#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r"
@ -72,9 +71,12 @@ static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] =
PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
};
/*
* Kernel params
*/
@ -256,8 +258,6 @@ final:
return status;
} /* _grucell_h_times_activation_r_initializer() */
/*
* Query kernel
*/
@ -313,7 +313,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,

View File

@ -38,16 +38,24 @@
__BEGIN_DECLS
#define HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 28) | (IN1_DTYPE << 20) | (IN0_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) \
"l2normalizescale_axis"#AXIS
#define KERNEL_SOURCE_1 "l2normalizescale_axis0"
#define KERNEL_SOURCE_2 "l2normalizescale_axis0_2d"
#define KERNEL_SOURCE_3 "l2normalizescale_axis1"
#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D"), \
HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) },
SOURCE },
#define HASH_L2NORMALIZESCALE_KERNELS( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE), \
SOURCE },
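/* The hash key packs axis, both input dtypes, the output dtype and the 2D flag
 * into a single 32-bit value; for example, axis 0 with U8 data, F16 scale and
 * U8 output in 2D mode maps to
 *     (0 << 28) | (F16 << 20) | (U8 << 12) | (U8 << 4) | 1,
 * which _query_kernel is expected to rebuild from the runtime tensors when it
 * scans _l2normalizescale_kernel_map. */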
typedef struct
{
@ -58,20 +66,27 @@ typedef struct
static const _kernel_map_type _l2normalizescale_kernel_map[] =
{
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS( 0, F16, F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, I8, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, U8, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, I16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, F16, KERNEL_SOURCE_1 )
};
/*
@ -119,6 +134,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
int32_t outputZP = 0;
float outputScale = 1.0f;
float r_inputScale = 1.0f;
float e2InScale = 1.0f;
float inOutScale = 1.0f;
int32_t axis2Dflg = 0;
int32_t inputWidth = 0;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -168,7 +187,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
outputScale = 1.0f / output_attr->asymm.scale;
}
e2InScale = inputScale * inputScale;
r_inputScale = 1.0f / inputScale;
inOutScale = inputScale * outputScale;
inputWidth = (int32_t)(output_shape->data[0]);
if (1 == axis)
{
@ -190,6 +212,13 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = output_shape->data[1];
if (output_shape->data[0] < GPU_TENSOR_MAX_WIDTH
&& output_shape->data[1] < GPU_TENSOR_MAX_WIDTH
&& (output_shape->size == 2 || (output_shape->size == 3 && output_shape->data[2] == 1)))
{
axis2Dflg = 1;
}
}
else
{
@ -257,8 +286,105 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (1 == axis)
if (axis2Dflg)
{
float zP2x = 2 * (float)inputZP;
float zpSqr8x = 8 * (float)inputZP * (float)inputZP;
float output_ZP = (float)outputZP;
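/* Sketch of the intended algebra: for quantized inputs the squared L2 norm
 * sum(((q - zp) * s)^2) expands to
 *     e2InScale * (sum(q^2) - zP2x * sum(q) + n8 * zpSqr8x)
 * with e2InScale = s^2, zP2x = 2*zp and zpSqr8x = 8*zp^2 per 8-element
 * dot-product group; inOutScale = inScale * outScale then requantizes the
 * normalized output. This describes how the shader presumably consumes the
 * constants set below. */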
status = vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth);
status |= vsi_nn_kernel_gpu_add_param( node, "zP2x", &zP2x);
status |= vsi_nn_kernel_gpu_add_param( node, "zpSqr8x", &zpSqr8x);
status |= vsi_nn_kernel_gpu_add_param( node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP);
status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP);
status |= vsi_nn_kernel_gpu_add_param( node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (1 == axis)
{
int32_t L2NorS_depth = (int32_t)(output_shape->data[1]);
status = vsi_nn_kernel_gpu_add_param( node, "L2NorS_depth", &L2NorS_depth);
@ -277,8 +403,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
}
else if (0 == axis)
{
int32_t inputWidth, inputWidthCount, inputWidthRemain256;
inputWidth = (int32_t)(output_shape->data[0]);
int32_t inputWidthCount, inputWidthRemain256;
inputWidthRemain256 = (int32_t)(output_shape->data[0] % 256);
inputWidthCount = (int32_t)(output_shape->data[0] / 256);
vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth);
@ -298,7 +423,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
}
}
{
if (axis2Dflg == 0)
{
float IntergerScale = inputScale;
float output_ZP = (float)outputZP;
gpu_dp_inst_t uniExtact8Bin_2x8 = {{
@ -473,7 +599,8 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) &&
(inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH);
status = _query_kernel( kernel, inputs, outputs, axis, image_2d );
if ( VSI_SUCCESS == status)
{

File diff suppressed because it is too large

View File

@ -910,6 +910,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "input01Scale", &inScaleMul );
status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;

View File

@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = (uint8_t)attr[2]->dfp.fl;
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);

View File

@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = (uint8_t)attr[2]->dfp.fl;
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);

View File

@ -0,0 +1,444 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
#define MOD_KERNEL_SOURCE_NAME "mod"
#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \
MOD_KERNEL_SOURCE_NAME },
#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \
MOD_KERNEL_SOURCE_NAME },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _mod_kernel_map[] =
{
// Register kernel here
MOD_KERNELS( F16, F16, F16 )
MOD_KERNELS( F16, F16, I16 )
MOD_KERNELS( F16, F16, I8 )
MOD_KERNELS( F16, F16, U8 )
MOD_KERNELS( I16, I16, I16 )
MOD_KERNELS( I8, I8, I8 )
MOD_KERNELS( U8, U8, U8 )
MOD_KERNELS( I16, I16, F16 )
MOD_KERNELS( I8, I8, F16 )
MOD_KERNELS( U8, U8, F16 )
MOD_KERNELS( BF16, BF16, BF16 )
MOD_KERNELS_2D( F16, F16, F16 )
MOD_KERNELS_2D( F16, F16, I16 )
MOD_KERNELS_2D( F16, F16, I8 )
MOD_KERNELS_2D( F16, F16, U8 )
MOD_KERNELS_2D( I16, I16, I16 )
MOD_KERNELS_2D( I8, I8, I8 )
MOD_KERNELS_2D( U8, U8, U8 )
MOD_KERNELS_2D( I16, I16, F16 )
MOD_KERNELS_2D( I8, I8, F16 )
MOD_KERNELS_2D( U8, U8, F16 )
MOD_KERNELS_2D( BF16, BF16, BF16 )
};
/*
* Kernel params
*/
static vx_param_description_t _mod_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_mod_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vx_status status = VX_FAILURE;
vx_tensor input0 = (vx_tensor)param[0];
vx_tensor input1 = (vx_tensor)param[1];
vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *input0_attr = NULL;
vsi_nn_kernel_tensor_attr_t *input1_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;
float in0Tail = 0;
float in1Tail = 0;
float outZp = 0;
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 );
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1 );
CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
input0_dtype = input0_attr->dtype;
gpu_param.dim = output_shape->size < 3 ? 2 : 3;
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
gpu_param.global_offset[2] = 0;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = output_shape->size > 2 ?
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}
if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}
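/* The shader is expected to dequantize each input as real = q * in_scale + tail
 * (tail is -scale * zero_point for asymmetric tensors and 0 for DFP) and to
 * requantize the result as q_out = real_out * out_scale + out_zp. For DFP
 * inputs the scale is 2^-fl when fl > 0 and 2^|fl| otherwise, matching the
 * branches above. */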
if (BF16 == input0_dtype)
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 );
status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail );
status |= vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 );
status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input0_attr)
{
vsi_nn_kernel_tensor_attr_release(&input0_attr);
}
if (input1_attr)
{
vsi_nn_kernel_tensor_attr_release(&input1_attr);
}
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _mod_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _mod_kernel_map;
size_t kernel_map_size = _cnt_of_array( _mod_kernel_map );
vx_param_description_t * param_def = _mod_kernel_param_def;
size_t param_def_size = _cnt_of_array( _mod_kernel_param_def );
vx_kernel_initialize_f initializer = _mod_initializer;
uint32_t key = 0;
uint32_t i = 0;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d);
for (i = 0; i < kernel_map_size; i ++)
{
if (kernel_map[i].key == key)
{
break;
}
}
if (i < kernel_map_size)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");
if (!vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ))
{
return NULL;
}
image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1);
if (vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == F16 ||
vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == F16 ||
vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == BF16 ||
vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == BF16)
{
isfmod = 1;
}
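/* Floating-point inputs (F16/BF16) are forced into fmod mode, i.e. the
 * C-style truncated remainder; this mirrors the ONNX Mod convention that
 * fmod must be 1 for floating-point tensors. */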
status = _query_kernel( kernel, inputs, outputs, image_2d);
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( mod, _setup )

View File

@ -38,69 +38,20 @@
__BEGIN_DECLS
#define VX_KERNEL_NAME_POW_F16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16F16toF16")
#define VX_KERNEL_NAME_POW_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toF16_2D")
#define VX_KERNEL_NAME_POW_F16F16TOU8 CVIVANTE_NAMESPACE("evis.pow_F16F16toU8")
#define VX_KERNEL_NAME_POW_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toU8_2D")
#define VX_KERNEL_NAME_POW_F16F16TOI8 CVIVANTE_NAMESPACE("evis.pow_F16F16toI8")
#define VX_KERNEL_NAME_POW_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI8_2D")
#define VX_KERNEL_NAME_POW_F16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16F16toI16")
#define VX_KERNEL_NAME_POW_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI16_2D")
#define VX_KERNEL_NAME_POW_F16U8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16U8toF16")
#define VX_KERNEL_NAME_POW_F16U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toF16_2D")
#define VX_KERNEL_NAME_POW_F16I8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I8toF16")
#define VX_KERNEL_NAME_POW_F16I8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toF16_2D")
#define VX_KERNEL_NAME_POW_F16I16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I16toF16")
#define VX_KERNEL_NAME_POW_F16I16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toF16_2D")
#define VX_KERNEL_NAME_POW_F16U8TOU8 CVIVANTE_NAMESPACE("evis.pow_F16U8toU8")
#define VX_KERNEL_NAME_POW_F16U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toU8_2D")
#define VX_KERNEL_NAME_POW_F16I8TOI8 CVIVANTE_NAMESPACE("evis.pow_F16I8toI8")
#define VX_KERNEL_NAME_POW_F16I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toI8_2D")
#define VX_KERNEL_NAME_POW_F16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16I16toI16")
#define VX_KERNEL_NAME_POW_F16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toI16_2D")
#define VX_KERNEL_NAME_POW_U8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_U8F16toF16")
#define VX_KERNEL_NAME_POW_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toF16_2D")
#define VX_KERNEL_NAME_POW_I8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I8F16toF16")
#define VX_KERNEL_NAME_POW_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toF16_2D")
#define VX_KERNEL_NAME_POW_I16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I16F16toF16")
#define VX_KERNEL_NAME_POW_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toF16_2D")
#define VX_KERNEL_NAME_POW_U8F16TOU8 CVIVANTE_NAMESPACE("evis.pow_U8F16toU8")
#define VX_KERNEL_NAME_POW_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toU8_2D")
#define VX_KERNEL_NAME_POW_I8F16TOI8 CVIVANTE_NAMESPACE("evis.pow_I8F16toI8")
#define VX_KERNEL_NAME_POW_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toI8_2D")
#define VX_KERNEL_NAME_POW_I16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16F16toI16")
#define VX_KERNEL_NAME_POW_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toI16_2D")
#define VX_KERNEL_NAME_POW_U8U8TOU8 CVIVANTE_NAMESPACE("evis.pow_U8U8toU8")
#define VX_KERNEL_NAME_POW_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toU8_2D")
#define VX_KERNEL_NAME_POW_I8I8TOI8 CVIVANTE_NAMESPACE("evis.pow_I8I8toI8")
#define VX_KERNEL_NAME_POW_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8I8toI8_2D")
#define VX_KERNEL_NAME_POW_I16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16I16toI16")
#define VX_KERNEL_NAME_POW_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16I16toI16_2D")
#define VX_KERNEL_NAME_POW_BF16BF16TOBF16 CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16")
#define VX_KERNEL_NAME_POW_BF16BF16TOBF16_2D CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16_2D")
#define VX_KERNEL_NAME_POW_U8U8TOF16 CVIVANTE_NAMESPACE("evis.pow_U8U8toF16")
#define VX_KERNEL_NAME_POW_U8U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toF16_2D")
#define KERNEL_SOURCE_1 "pow_fp16",
#define KERNEL_SOURCE_2 "pow_fp16_i8",
#define KERNEL_SOURCE_3 "pow_fp16_i16",
#define KERNEL_SOURCE_4 "pow_u8",
#define KERNEL_SOURCE_5 "pow_i8",
#define KERNEL_SOURCE_6 "pow_i16"
#define KERNEL_SOURCE "pow",
#define HASH_POW_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \
SOURCE },
CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE), \
KERNEL_SOURCE },
#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \
SOURCE },
CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE"_2D"), \
KERNEL_SOURCE },
static const struct {
uint32_t key;
@ -108,59 +59,59 @@ static const struct {
const char* source_name;
} pow_map[] =
{
TENSOR_POW_KERNELS(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, F16, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, U8, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, U8, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, F16, F16)
TENSOR_POW_KERNELS(F16, F16, U8)
TENSOR_POW_KERNELS(F16, U8, F16)
TENSOR_POW_KERNELS(F16, U8, U8)
TENSOR_POW_KERNELS(F16, F16, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS(F16, I8, F16, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS(F16, I8, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS(F16, F16, I8)
TENSOR_POW_KERNELS(F16, I8, F16)
TENSOR_POW_KERNELS(F16, I8, I8)
TENSOR_POW_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(F16, I16, F16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(F16, I16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(F16, F16, I16)
TENSOR_POW_KERNELS(F16, I16, F16)
TENSOR_POW_KERNELS(F16, I16, I16)
TENSOR_POW_KERNELS(U8, F16, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, F16, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, U8, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, U8, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, F16, F16)
TENSOR_POW_KERNELS(U8, F16, U8)
TENSOR_POW_KERNELS(U8, U8, U8)
TENSOR_POW_KERNELS(U8, U8, F16)
TENSOR_POW_KERNELS(I8, F16, F16, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS(I8, F16, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS(I8, I8, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS(I8, F16, F16)
TENSOR_POW_KERNELS(I8, F16, I8)
TENSOR_POW_KERNELS(I8, I8, I8)
TENSOR_POW_KERNELS(I16, F16, F16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS(I16, F16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS(I16, I16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS(BF16, BF16, BF16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(I16, F16, F16)
TENSOR_POW_KERNELS(I16, F16, I16)
TENSOR_POW_KERNELS(I16, I16, I16)
TENSOR_POW_KERNELS(BF16, BF16, BF16)
TENSOR_POW_KERNELS_2D(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, U8, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, U8, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, F16, F16)
TENSOR_POW_KERNELS_2D(F16, F16, U8)
TENSOR_POW_KERNELS_2D(F16, U8, F16)
TENSOR_POW_KERNELS_2D(F16, U8, U8)
TENSOR_POW_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS_2D(F16, I8, F16, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS_2D(F16, I8, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS_2D(F16, F16, I8)
TENSOR_POW_KERNELS_2D(F16, I8, F16)
TENSOR_POW_KERNELS_2D(F16, I8, I8)
TENSOR_POW_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(F16, I16, F16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(F16, I16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(F16, F16, I16)
TENSOR_POW_KERNELS_2D(F16, I16, F16)
TENSOR_POW_KERNELS_2D(F16, I16, I16)
TENSOR_POW_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, F16, F16)
TENSOR_POW_KERNELS_2D(U8, F16, U8)
TENSOR_POW_KERNELS_2D(U8, U8, U8)
TENSOR_POW_KERNELS_2D(U8, U8, F16)
TENSOR_POW_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS_2D(I8, F16, F16)
TENSOR_POW_KERNELS_2D(I8, F16, I8)
TENSOR_POW_KERNELS_2D(I8, I8, I8)
TENSOR_POW_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS_2D(BF16, BF16, BF16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(I16, F16, F16)
TENSOR_POW_KERNELS_2D(I16, F16, I16)
TENSOR_POW_KERNELS_2D(I16, I16, I16)
TENSOR_POW_KERNELS_2D(BF16, BF16, BF16)
};
static vx_param_description_t vxPowKernel_param_def[] =
@ -186,24 +137,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
int8_t in0_fl = 0;
int32_t src0ZP = 0;
float src0Scale = 1.0f;
int8_t in1_fl = 0;
int32_t src1ZP = 0;
float src1Scale = 1.0f;
int8_t out_fl = 0;
float dstZP = 0;
float dstScale = 1.0f;
float input0_scale = 1.0f;
float input1_scale = 1.0f;
float input0_tail = 0;
float input1_tail = 0;
float output_scale = 1.0f;
float output_zp = 0;
int32_t postshift0 = 0;
int32_t postshift1 = 0;
float outScale_fl = 1;
uint16_t M0 = 0;
uint16_t M1 = 0;
vsi_size_t zAx = 1;
uint32_t pack_key = 0;
// dim number ???
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
@ -220,58 +160,59 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (int8_t)attr[0]->dfp.fl;
postshift0 = in0_fl - 0;
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postshift0);
input0_scale = attr[0]->asymm.scale;
input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in1_fl = (int8_t)attr[1]->dfp.fl;
postshift1 = in1_fl - 0;
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
src1ZP = attr[1]->asymm.zero_point;
src1Scale = attr[1]->asymm.scale;
gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postshift1);
input1_scale = attr[1]->asymm.scale;
input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
if (out_fl > 0)
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
outScale_fl = (vx_float32)((int64_t)1 << out_fl);
output_scale = (float) ((int64_t)1 << fl);
}
else
{
outScale_fl = (1.0f / (vx_float32)((int64_t)1 << -out_fl));
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
dstZP = (float)attr[2]->asymm.zero_point;
dstScale = 1.0f / attr[2]->asymm.scale;
}
if ( out_shape->size < 3 )
{
zAx = 1;
}
else
{
zAx = out_shape->data[2];
output_zp = (float)attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
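/*
 * Note (illustrative, not part of this patch): the branches above reduce every
 * quantization flavor to a scale/tail pair. For dynamic fixed point with
 * fractional length fl, the stored value is real * 2^fl, so the input
 * dequantize scale is 2^-fl and the output requantize scale is 2^fl; for
 * asymmetric/symmetric affine tensors the inputs use
 * input_tail = -zero_point * scale, and the output uses 1/scale plus the
 * zero point. A small helper pair expressing the same arithmetic:
 *
 *   #include <stdint.h>
 *
 *   static float dfp_input_scale(int32_t fl)   // dequantize: q -> real
 *   {
 *       return (fl > 0) ? 1.0f / (float)((int64_t)1 << fl)
 *                       : (float)((int64_t)1 << -fl);
 *   }
 *
 *   static float dfp_output_scale(int32_t fl)  // requantize: real -> q
 *   {
 *       return (fl > 0) ? (float)((int64_t)1 << fl)
 *                       : 1.0f / (float)((int64_t)1 << -fl);
 *   }
 */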
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
@ -287,269 +228,122 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = gpu_align_p2((out_shape->data[1] + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 2);
shaderParam.global_size[2] = gpu_align_p2((zAx + shaderParam.global_scale[2] - 1)
/ shaderParam.global_scale[2], 1);
shaderParam.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( pack_key )
{
gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertFstDataToFp32_4x4_2 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4_2 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4_2 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4_2 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
uint32_t multiplierA = (M0 << 16) | M0;
uint32_t multiplierB = (M1 << 16) | M1;
int32_t i = 8;
uniConvertUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F);
uniConvertSecUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F);
uniConvertUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F);
uniConvertSecUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F);
for ( i = 8; i < 16; i += 2 )
case _PACK_SELECT_KEY( BF16, BF16, BF16 ):
{
uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA;
uniConvertSecUint8SubZpToFp32_4x4.data[i] = multiplierA;
uniConvertUint8SubZpToFp32_4x4_2.data[i] = multiplierB;
uniConvertSecUint8SubZpToFp32_4x4_2.data[i] = multiplierB;
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
if ( attr[0]->dtype == I8 || attr[0]->dtype == I16 )
break;
default:
{
gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4, postshift0 );
gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4, postshift0 );
}
gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16};
if ( attr[1]->dtype == I8 || attr[1]->dtype == I16 )
{
gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4_2, postshift1 );
gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4_2, postshift1 );
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
if (attr[2]->dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8",
&uniExtactHalf8_2x8);
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8",
&uniExtact8Bit_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
switch( pack_key )
{
case _PACK_SELECT_KEY( F16, F16, I8 ):
case _PACK_SELECT_KEY( F16, I8, F16 ):
case _PACK_SELECT_KEY( F16, I8, I8 ):
case _PACK_SELECT_KEY( F16, F16, I16 ):
case _PACK_SELECT_KEY( F16, I16, F16 ):
case _PACK_SELECT_KEY( F16, I16, I16 ):
case _PACK_SELECT_KEY( I8, F16, F16 ):
case _PACK_SELECT_KEY( I8, F16, I8 ):
case _PACK_SELECT_KEY( I8, I8, I8 ):
case _PACK_SELECT_KEY( I16, F16, F16 ):
case _PACK_SELECT_KEY( I16, F16, I16 ):
case _PACK_SELECT_KEY( I16, I16, I16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2",
&uniConvertFstDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2",
&uniConvertSecDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "outScale_fl", &outScale_fl);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, F16 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( U8, U8, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4",
&uniConvertUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4",
&uniConvertSecUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2",
&uniConvertFstDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2",
&uniConvertSecDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2",
&uniConvertUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2",
&uniConvertSecUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
&uniConvertHalftoFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP0", &src0ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
case _PACK_SELECT_KEY( F16, U8, F16 ):
case _PACK_SELECT_KEY( F16, U8, U8 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2",
&uniConvertUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2",
&uniConvertSecUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
&uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, BF16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
default:
break;
}
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
break;
}
#undef _PACK_SELECT_KEY
OnError:
if ( attr[0] )
@ -646,7 +440,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_POW_PARAM_NUM,
inputs, 2, outputs, 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_POW_PARAM_NUM );
}
}
return node;
@ -655,4 +448,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( pow, _setup )
View File
@ -126,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -152,7 +150,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
View File
@ -128,8 +128,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
out_shape = attr[0]->shape;
dstZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -147,7 +145,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = (float)attr[0]->asymm.zero_point;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
View File
@ -148,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -161,7 +159,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
View File
@ -35,13 +35,15 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define KERNEL_SOURCE_0 "pre_process_rgb888_planar_0",
#define KERNEL_SOURCE_1 "pre_process_rgb888_planar_1",
#define KERNEL_SOURCE_2 "pre_process_rgb888_planar_2",
#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_0",
#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_1",
#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_2",
#define RGB888_SOURCE_0 "pre_process_rgb888_planar_0",
#define RGB888_SOURCE_1 "pre_process_rgb888_planar_1",
#define RGB888_SOURCE_2 "pre_process_rgb888_planar_2",
#define STR(a) #a
@ -53,28 +55,48 @@ typedef enum
HALF
} _internal_scale_e;
// Add kernel hashtable here
#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE_FLAG ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | (SCALE_FLAG))
#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG))
#define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_0 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_0 }
#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_0 }
#define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_1 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_1 }
#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_1 }
#define PACK_KERNEL_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_2 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_2 }
#define PACK_KERNEL_SEP_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_2 }
#define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_2 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_2 }
#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_2 }
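/*
 * Note (illustrative, not part of this patch): the hash key gains a SEP bit so
 * one table can hold both the packed rgb888_planar kernels and the new *_sep
 * variants that read three separate R/G/B plane tensors; the caller derives
 * SEP from whether a second input plane was supplied. A minimal sketch of that
 * selection, with hypothetical argument names:
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   #define HASH(in, out, sep, scale) \
 *       (((uint32_t)(in) << 16) | ((out) << 8) | ((sep) << 4) | (scale))
 *
 *   static uint32_t make_key(uint32_t in_dt, uint32_t out_dt,
 *                            uint32_t scale_type, const void *second_plane)
 *   {
 *       uint32_t sep = (second_plane != NULL) ? 1u : 0u;
 *       return HASH(in_dt, out_dt, sep, scale_type);
 *   }
 */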
typedef struct
{
@ -98,6 +120,19 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] =
PACK_KERNEL_4_OVER_3_MAP( U8, U8 ),
PACK_KERNEL_HALF_MAP( U8, U8 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ),
PACK_KERNEL_SEP_COPY_MAP( U8, F16 ),
PACK_KERNEL_SEP_COPY_MAP( U8, I16 ),
PACK_KERNEL_SEP_COPY_MAP( U8, I8 ),
PACK_KERNEL_SEP_COPY_MAP( U8, U8 ),
PACK_KERNEL_SEP_4_OVER_3_MAP( U8, U8 ),
PACK_KERNEL_SEP_HALF_MAP( U8, U8 ),
};
@ -105,6 +140,23 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] =
* Kernel params
*/
static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -121,7 +173,7 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )
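/*
 * Note (illustrative, not part of this patch): the sep parameter table carries
 * three input plane tensors, so its first output sits at param[3]; the packed
 * table has a single input and its first output sits at param[1]. The
 * initializers below use param_size to pick the right index before reading the
 * output tensor attribute and the trailing scale scalar. Sketch of that rule:
 *
 *   #include <stddef.h>
 *
 *   static size_t first_output_index(size_t param_count, size_t sep_param_count)
 *   {
 *       return (param_count == sep_param_count) ? 3u : 1u;
 *   }
 */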
/*
* Kernel initializer
@ -149,9 +201,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
@ -310,9 +369,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
@ -406,7 +472,14 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
out_shape = attr[1]->shape;
@ -540,6 +613,7 @@ static vsi_status _query_kernel
vsi_bool is_4_over_3 = FALSE;
vsi_bool is_half_scale = FALSE;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL);
is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) &&
(height * 3 == (int32_t)outputs[0]->attr.size[1] * 4);
@ -568,7 +642,7 @@ static vsi_status _query_kernel
}
}
key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, scale_type);
key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type);
for ( i = 0; i < _cnt_of_array(pre_process_rgb888_planar_kernel_map); i ++ )
{
@ -581,8 +655,17 @@ static vsi_status _query_kernel
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s",
pre_process_rgb888_planar_kernel_map[i].function_name );
kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
if (is_rgb888_sep)
{
kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def );
}
else
{
kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
}
if (enable_copy)
{
@ -620,8 +703,9 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM];
vsi_nn_kernel_node_param_t* node_params = NULL;
vsi_nn_kernel_node_t node = NULL;
int32_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM;
int32_t width = vsi_nn_kernel_param_get_int32( params, "width" );
int32_t height = vsi_nn_kernel_param_get_int32( params, "height" );
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
@ -630,7 +714,10 @@ static vsi_nn_kernel_node_t _setup
float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
vsi_bool is_no_range_change = FALSE;
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
input_num = inputs[1] == NULL ? 1 : input_num;
param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -648,17 +735,19 @@ static vsi_nn_kernel_node_t _setup
status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height );
if ( VSI_SUCCESS == status)
{
node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 6;
uint32_t index = inputs[1] == NULL ? 4 : 6;
uint32_t scalar_index = index;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM,
vsi_nn_kernel_node_pack_io( node_params, param_count,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
@ -670,17 +759,21 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
status = vsi_nn_kernel_node_pass_param( node, node_params, param_count );
index = scalar_index;
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
}
}
vsi_nn_safe_free(node_params);
return node;
} /* _setup() */
View File
@ -150,8 +150,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
outputZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -176,7 +174,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
outputZP = (float)attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
View File
@ -135,8 +135,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -151,9 +149,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
width = width / 3;
}
if (attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
shaderParam.global_scale[0] = 16;
View File
@ -130,8 +130,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -141,9 +139,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
order1 = 0;
}
if (attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
shaderParam.global_scale[0] = 16;
View File
@ -51,6 +51,7 @@ typedef enum
UP_3X_HALF,
UP_4X_HALF,
UP_8X_HALF,
UP_8X_ALIGN,
} _internal_scale_e;
#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type
@ -102,6 +103,12 @@ typedef enum
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_align_corners"), \
"resize_bilinear_align_corners" }
typedef struct
{
uint32_t key;
@ -128,6 +135,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_ALIGN(U8, U8),
};
@ -228,11 +236,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
uint32_t out_height;
float half_pixel_value = 0.0f;
vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size);
vsi_bool is_half_pixel_centers = FALSE;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -257,20 +260,20 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
if (align_corners && out_width > 1)
{
scale_factor[0] = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1);
}
else
{
scale_factor[0] = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width;
}
if (align_corners && out_height > 1)
{
scale_factor[1] = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1);
}
else
{
scale_factor[1] = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height;
}
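/*
 * Note (illustrative, not part of this patch): with align_corners the first and
 * last output samples land exactly on the first and last input pixels, so the
 * step is (in - 1) / (out - 1); otherwise the step is in / out, and
 * half_pixel_centers additionally offsets the sample position by half a pixel
 * (the half_pixel_value set just below). A standalone sketch of the source
 * coordinate computation under those conventions:
 *
 *   static float resize_src_coord(int dst, int in, int out,
 *                                 int align_corners, int half_pixel_centers)
 *   {
 *       float scale = (align_corners && out > 1)
 *                   ? (float)(in - 1) / (float)(out - 1)
 *                   : (float)in / (float)out;
 *       return half_pixel_centers ? ((float)dst + 0.5f) * scale - 0.5f
 *                                 : (float)dst * scale;
 *   }
 */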
if (half_pixel_centers)
@ -282,16 +285,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
half_pixel_value = 0.0f;
}
is_half_pixel_centers = (!align_corners) && (half_pixel_centers);
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers)
{
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
@ -302,11 +295,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos);
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
@ -326,11 +319,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos);
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
@ -340,226 +333,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
outputZP = 0;
}
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
}
else if (is_3x_up_kernel)
{
gpu_param.global_scale[0] = 15;
gpu_param.global_scale[1] = 6;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
@ -840,7 +618,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
{
float uint8Scale = 1.0f / output_scale;
float uint8ZP_out = (vx_float32)outputZP;
float uint8ZP_out = (float)outputZP;
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
@ -1045,11 +823,299 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
goto final;
}
if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel&& !is_8x_up_kernel)
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_bilinear_initializer() */
DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
uint32_t depth = 0;
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
{
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}
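/*
 * Note (illustrative, not part of this patch): this optimized path only fires
 * for U8 tensors whose input and output quantization match and whose spatial
 * dims are exact 2x/3x/4x/8x upsamples, so the bilinear weights can be baked
 * into the DP instructions below. A compact form of the same factor check:
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   static int integer_upscale_factor(uint32_t in_w, uint32_t in_h,
 *                                     uint32_t out_w, uint32_t out_h)
 *   {
 *       static const uint32_t factors[] = { 2, 3, 4, 8 };
 *       size_t i;
 *       for (i = 0; i < sizeof(factors) / sizeof(factors[0]); i++) {
 *           if (factors[i] * in_w == out_w && factors[i] * in_h == out_h) {
 *               return (int)factors[i];
 *           }
 *       }
 *       return 0;  /* no dedicated fast path */
 *   }
 */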
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
}
else if (is_3x_up_kernel)
{
gpu_param.global_scale[0] = 15;
gpu_param.global_scale[1] = 6;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
VSILOGE("input or output's format is not support");
status = VSI_FAILURE;
goto final;
}
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
@ -1071,7 +1137,168 @@ final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_bilinear_initializer() */
} /* _bilinear_half_pixel_centers_opt_initializer() */
DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
uint32_t depth = 0;
float scale_factor[2] = {0};
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_8x_align_corners = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
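    /* Per-axis scale factor with align_corners semantics:
       (in - 1) / (out - 1) when the output size is greater than 1, otherwise in / out. */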
if (out_width > 1)
{
scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1);
}
else
{
scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width;
}
if (out_height > 1)
{
scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1);
}
else
{
scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height;
}
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
{
        is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f);
}
if (is_8x_align_corners)
{
gpu_param.global_scale[0] = 2;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
if (is_8x_align_corners)
{
gpu_dp_inst_t uniBilinear_8x_l10_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00000838, 0x01070731, 0x02060e2a, 0x03051523,
0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l11_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00000838, 0x01070731, 0x02060e2a, 0x03051523,
0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l20_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e,
0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l21_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e,
0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l30_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19,
0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l31_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19,
0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l40_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00002020, 0x041c041c, 0x08180818, 0x0c140c14,
0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l41_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00002020, 0x041c041c, 0x08180818, 0x0c140c14,
0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l10_4x8", &uniBilinear_8x_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l11_4x8", &uniBilinear_8x_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l20_4x8", &uniBilinear_8x_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l21_4x8", &uniBilinear_8x_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l30_4x8", &uniBilinear_8x_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l31_4x8", &uniBilinear_8x_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l40_4x8", &uniBilinear_8x_l40_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l41_4x8", &uniBilinear_8x_l41_4x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
VSILOGE("input or output's format is not support");
status = VSI_FAILURE;
goto final;
}
gpu_param.global_size[0] = gpu_align_p2((in_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (in_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _bilinear_align_corners_opt_initializer() */
/*
* Query kernel
@ -1098,19 +1325,46 @@ static vsi_status _query_kernel
vx_kernel_initialize_f initializer = _resize_bilinear_initializer;
uint32_t key;
uint32_t i;
vsi_bool is_2x_upsample =(2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_3x_upsample =(3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
float width_scale = 0;
float height_scale = 0;
vsi_size_t input_width = inputs[0]->attr.size[0];
vsi_size_t input_height = inputs[0]->attr.size[1];
vsi_size_t output_width = outputs[0]->attr.size[0];
vsi_size_t output_height = outputs[0]->attr.size[1];
vsi_bool is_2x_upsample =(2 * input_width == output_width) \
&& (2 * input_height == output_height);
vsi_bool is_3x_upsample =(3 * input_width == output_width) \
&& (3 * input_height == output_height);
vsi_bool is_4x_upsample =(4 * input_width == output_width) \
&& (4 * input_height == output_height);
vsi_bool is_8x_upsample =(8 * input_width == output_width) \
&& (8 * input_height == output_height);
vsi_bool is_8x_align_corners = FALSE;
_internal_scale_e scale_flag = UP;
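    /* Compute the effective width/height scales used to detect the 8x align_corners fast path below. */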
    if (align_corners && output_width > 1)
{
width_scale = ((float)(input_width - 1) * 1.0f) / (float)(output_width - 1);
}
else
{
width_scale = ((float)input_width * 1.0f) / (float)output_width;
}
if (align_corners && output_height > 1)
{
height_scale = ((float)(input_height - 1) * 1.0f) / (float)(output_height - 1);
}
else
{
height_scale = ((float)input_height * 1.0f) / (float)output_height;
}
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
is_8x_align_corners = (vsi_bool)( width_scale == 0.125f && height_scale == 0.125f && in_dtype == U8 );
is_2x_upsample &= (in_dtype == U8);
is_3x_upsample &= (in_dtype == U8);
is_4x_upsample &= (in_dtype == U8);
@ -1121,18 +1375,27 @@ static vsi_status _query_kernel
if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample)
{
scale_flag = UP_2X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample)
{
scale_flag = UP_3X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample)
{
scale_flag = UP_4X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample)
{
scale_flag = UP_8X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (align_corners) && (!half_pixel_centers) && is_8x_align_corners)
{
scale_flag = UP_8X_ALIGN;
initializer = _bilinear_align_corners_opt_initializer;
}
else if (is_same_type && is_evis2)
{
@ -1240,20 +1503,20 @@ static vsi_nn_tensor_t* _create_scale_tensor
if (align_corners && width > 1)
{
width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1);
width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1);
}
else
{
width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)width;
width_scale = ((float)input_width * 1.0f) / (float)width;
}
if (align_corners && height > 1)
{
height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(height - 1);
height_scale = ((float)(input_height - 1) * 1.0f) / (float)(height - 1);
}
else
{
height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)height;
height_scale = ((float)input_height * 1.0f) / (float)height;
}
@ -1273,7 +1536,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
int32_t h0 = 0;
if (half_pixel_centers)
{
input_h = ((vx_float32)y + 0.5f) * height_scale - 0.5f;
input_h = ((float)y + 0.5f) * height_scale - 0.5f;
}
else
{
@ -1291,7 +1554,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
float br = 0.0f;
if (half_pixel_centers)
{
input_w = ((vx_float32)x + 0.5f) * width_scale - 0.5f;
input_w = ((float)x + 0.5f) * width_scale - 0.5f;
}
else
{

View File

@ -51,6 +51,15 @@ __BEGIN_DECLS
"_"STR(UP_SCALE)"x_upsample_half_pixel_centers"), \
"resize_bilinear_nhwc" }
#define BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8) | (UP_SCALE << 16))
#define BILINEAR_NHWC_BOUND_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \
{ BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_nhwc_bound_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_"STR(UP_SCALE)"x"), \
"resize_bilinear_nhwc_bound" }
typedef struct
{
uint32_t key;
@ -65,6 +74,12 @@ static const _kernel_map_type _resize_bilinear_nhwc_kernel_map[] =
BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 4),
};
static const _kernel_map_type _bilinear_nhwc_bound_kernel_map[] =
{
BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 2),
BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 3),
BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 4),
};
/*
* Kernel params
@ -81,6 +96,14 @@ static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] =
#define SCALAR_ALIGN_CORNERS (2)
#define SCALAR_HALF_PIXEL (3)
static vx_param_description_t _bilinear_nhwc_bound_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BILINEAR_NHWC_BOUND_PARAM_NUM _cnt_of_array( _bilinear_nhwc_bound_kernel_param_def )
/*
* Kernel initializer
*/
@ -382,50 +405,193 @@ final:
return status;
} /* _resize_bilinear_initializer() */
DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_size_array_t * out_shape = NULL;
uint32_t x_coord[2] = {0};
uint32_t in_width;
uint32_t in_height;
uint32_t out_width;
uint32_t out_height;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
out_shape = output_attr->shape;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize_x2_nhwc2_0_4x8 = {{
0x55555511, 0x55555555, // TCfg
0x46104000, 0x3a48829c, 0x4882acca, 0xc4acca3a, 0xbd4e5b50, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x000c0004, 0x09030301, 0x03090103, 0x03090103,
0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_param.global_scale[0] = 2;
gpu_param.global_scale[1] = 1;
x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
x_coord[0] = (x_coord[1] * 2 - 1) >> 2;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_0_4x8", &uniResize_x2_nhwc2_0_4x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize_x3_nhwc2_l10_4x4 = {{
0x05055511, // TCfg
0x04045010, // ASelt
0x31310000, 0x00330022, // ABin
0x0a0aaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x38e41c72, 0x1c720e39,
0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_param.global_scale[0] = 3;
gpu_param.global_scale[1] = 1;
x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
x_coord[0] = (x_coord[1] - 1) / 6 * 2;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l10_4x4", &uniResize_x3_nhwc2_l10_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize_x4_nhwc2_l00_4x8 = {{
0x55555511, 0x55555555, // TCfg
0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect
0x00000706, // AccumType, ConstantType, and PostShift
0x00280018, 0x190f0f09, 0x23051503, 0x23051503,
0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize_x4_nhwc2_l10_4x8 = {{
0x55555511, 0x55555555, // TCfg
0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect
0x00000706, // AccumType, ConstantType, and PostShift
0x00380008, 0x23150503, 0x31070701, 0x31070701,
0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
x_coord[0] = ((x_coord[1] - 3) >> 3) * 2;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l00_4x8", &uniResize_x4_nhwc2_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l10_4x8", &uniResize_x4_nhwc2_l10_4x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
VSILOGE("input or output's format is not support");
status = VSI_FAILURE;
goto final;
}
gpu_param.global_size[0] = gpu_align_p2((out_height + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = 1;
gpu_param.dim = 2;
status |= vsi_nn_kernel_gpu_add_param( node, "x_coord", &x_coord);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _bilinear_nhwc_bound_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t align_corners,
int32_t half_pixel_centers,
uint32_t up_scale
const uint32_t hashkey,
uint32_t kernel_id
)
{
vx_kernel_initialize_f initializer = NULL;
vx_param_description_t * param_def;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _resize_bilinear_nhwc_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map );
vx_param_description_t * param_def = _resize_bilinear_nhwc_kernel_param_def;
size_t param_def_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def );
vx_kernel_initialize_f initializer = _resize_bilinear_nhwc_initializer;
uint32_t key;
uint32_t i;
const _kernel_map_type* kernel_map;
size_t kernel_map_size;
size_t param_size;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in_dtype = in_dtype == I8 ? U8 : in_dtype;
out_dtype = out_dtype == I8 ? U8 : out_dtype;
key = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, align_corners, up_scale );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
switch( kernel_id )
{
if ( kernel_map[i].key == key )
case 0:
initializer = _resize_bilinear_nhwc_initializer;
kernel_map = _resize_bilinear_nhwc_kernel_map;
kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map );
param_def = _resize_bilinear_nhwc_kernel_param_def;
param_size = _RESIZE_BILINEAR_NHWC_PARAM_NUM;
break;
case 1:
initializer = _bilinear_nhwc_bound_initializer;
kernel_map = _bilinear_nhwc_bound_kernel_map;
kernel_map_size = _cnt_of_array( _bilinear_nhwc_bound_kernel_map );
param_def = _bilinear_nhwc_bound_kernel_param_def;
param_size = _BILINEAR_NHWC_BOUND_PARAM_NUM;
break;
default:
VSI_ASSERT( FALSE );
return VSI_FAILURE;
}
for( i = 0; i < kernel_map_size; i ++ )
{
if( kernel_map[i].key == hashkey )
{
break;
}
}
if ( i < kernel_map_size )
if( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.numParams = (uint32_t)param_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@ -453,7 +619,8 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_param_t node0_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_param_t node1_params[_BILINEAR_NHWC_BOUND_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
@ -463,8 +630,14 @@ static vsi_nn_kernel_node_t _setup
float scale_y = (float)outputs[0]->attr.size[2] / (float)inputs[0]->attr.size[2];
float up_scale = scale_x == scale_y ? scale_x : 0;
uint32_t rank = inputs[0]->attr.dim_num;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
vsi_nn_kernel_t * ikernels[2] = { NULL };
uint32_t hashkeys[2] = {0};
uint32_t i = 0;
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
if (!is_same_type || depth != 2 || rank < 3 ||
(up_scale != 2.0f && up_scale != 3.0f && up_scale != 4.0f))
@ -472,8 +645,24 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _query_kernel( kernel, inputs, outputs,
align_corners, half_pixel_centers, (uint32_t)up_scale);
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
// Assign unique_id
ikernels[0]->unique_id = kernel->unique_id;
ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
// Assign unique_id
ikernels[1]->unique_id = kernel->unique_id;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
hashkeys[0] = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers,
align_corners, (vsi_size_t)up_scale );
hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale );
status = _query_kernel( ikernels[0], hashkeys[0], 0);
CHECK_STATUS_FAIL_GOTO(status, final );
status = _query_kernel( kernel, hashkeys[1], 1);
CHECK_STATUS_FAIL_GOTO(status, final );
shapes[0][0] = depth * inputs[0]->attr.size[1];
shapes[0][1] = inputs[0]->attr.size[2];
@ -491,26 +680,41 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[1], rank );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[1], output_num );
node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
// resize bilinear
node = vsi_nn_kernel_create_node( graph, ikernels[0] );
VSI_ASSERT( node != NULL );
vsi_nn_kernel_node_pack_io( node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[1], output_num );
node0_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
node0_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
status = vsi_nn_kernel_node_pass_param( node, node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node0_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node0_params[SCALAR_HALF_PIXEL] );
vsi_nn_kernel_node_release( &node );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
// update bound for output tensor
memcpy( &attr, &(reshape_tensors[1]->attr), sizeof(vsi_nn_tensor_attr_t) );
attr.size[0] = 1;
attr.size[1] = 1;
attr.dim_num = 2;
reshape_tensors[2] = vsi_nn_CreateTensor( graph, &attr );
node = vsi_nn_kernel_create_node( graph, kernel );
VSI_ASSERT( node != NULL );
vsi_nn_kernel_node_pack_io( node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM );
final:
for( i = 0; i < 2; i ++ )
{
if( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
}
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -118,7 +118,7 @@ static vsi_status get_scatter_nd_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)

View File

@ -207,7 +207,7 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)

View File

@ -75,10 +75,24 @@ static const _kernel_map_type _select_kernel_map[] =
PACK_KERNEL_MAP(I8, U8, U8, U8),
PACK_KERNEL_MAP(I8, I16, I16, I16),
PACK_KERNEL_MAP(I8, F16, F16, F16),
PACK_KERNEL_MAP(I8, F16, U8, F16),
PACK_KERNEL_MAP(I8, U8, F16, F16),
PACK_KERNEL_MAP(I8, F16, I8, F16),
PACK_KERNEL_MAP(I8, I8, F16, F16),
PACK_KERNEL_MAP(I8, F16, I16, F16),
PACK_KERNEL_MAP(I8, I16, F16, F16),
PACK_KERNEL_MAP(I8, F16, F16, U8),
PACK_KERNEL_MAP_2D(I8, I8, I8, I8),
PACK_KERNEL_MAP_2D(I8, U8, U8, U8),
PACK_KERNEL_MAP_2D(I8, I16, I16, I16),
PACK_KERNEL_MAP_2D(I8, F16, F16, F16),
PACK_KERNEL_MAP_2D(I8, U8, F16, F16),
PACK_KERNEL_MAP_2D(I8, F16, U8, F16),
PACK_KERNEL_MAP_2D(I8, F16, I8, F16),
PACK_KERNEL_MAP_2D(I8, I8, F16, F16),
PACK_KERNEL_MAP_2D(I8, F16, I16, F16),
PACK_KERNEL_MAP_2D(I8, I16, F16, F16),
PACK_KERNEL_MAP_2D(I8, F16, F16, U8),
};
/*
@ -142,7 +156,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
@ -154,13 +168,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
input0Scale = (float)((int64_t)1 << -input0_fl);
}
}
else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input0Scale = input0_attr->asymm.scale;
input0Zp = input0_attr->asymm.zero_point;
}
if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
@ -172,13 +186,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
input1Scale = (float)((int64_t)1 << -input1_fl);
}
}
else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input1Scale = input1_attr->asymm.scale;
input1Zp = input1_attr->asymm.zero_point;
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
@ -190,7 +204,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
outputScale = (float)((int64_t)1 << -output_fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
@ -203,13 +217,10 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
output_shape = output_attr->shape;
gpu_param.dim = output_shape->size < 3 ? 2 : 3;
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
gpu_param.global_offset[2] = 0;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
@ -218,83 +229,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
switch( pack_key )
{
case _PACK_SELECT_KEY( I8, I8, I8 ):
case _PACK_SELECT_KEY( I16, I16, I16 ):
{
gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvIntIn0toDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvIntIn1toDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if (input0_fl >= output_fl)
{
uint8_t postshift = (uint8_t)gpu_min(input0_fl - output_fl, MAX_POST_SHIFT_BITS);
uniConvIntIn0toDst_2x8.data[7] = uniConvIntIn0toDst_2x8.data[7] | (postshift & 0x1F);
}
else
{
uint32_t idx = 0;
uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input0_fl), MAX_MULTIPLIER_NUM);
for (idx = 8; idx < 16; idx ++)
{
uniConvIntIn0toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff);
}
}
if (input1_fl >= output_fl)
{
uint8_t postshift = (uint8_t)gpu_min(input1_fl - output_fl, MAX_POST_SHIFT_BITS);
uniConvIntIn1toDst_2x8.data[7] = uniConvIntIn1toDst_2x8.data[7] | (postshift & 0x1F);
}
else
{
uint32_t idx = 0;
uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input1_fl), MAX_MULTIPLIER_NUM);
for (idx = 8; idx < 16; idx ++)
{
uniConvIntIn1toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff);
}
}
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvIntIn0toDst_2x8", &uniConvIntIn0toDst_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvIntIn1toDst_2x8", &uniConvIntIn1toDst_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16 ):
{
gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
@ -312,61 +248,66 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( I8, I8, I8 ):
case _PACK_SELECT_KEY( I16, I16, I16 ):
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( I8, F16, F16 ):
case _PACK_SELECT_KEY( U8, F16, F16 ):
case _PACK_SELECT_KEY( I16, F16, F16 ):
case _PACK_SELECT_KEY( F16, U8, F16 ):
case _PACK_SELECT_KEY( F16, I8, F16 ):
case _PACK_SELECT_KEY( F16, I16, F16 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
{
uint32_t idx = 0;
gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In0_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
uint32_t multAndoutZP0[2] = {0};
uint32_t multAndoutZP1[2] = {0};
gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In1_2x8 = {{
0x99999999, // TCfg
gpu_dp_inst_t uniU8MulAndPostShift0_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniU8AddZP_2x8 = {{
0x55555555, // TCfg
gpu_dp_inst_t uniU8MulAndPostShift1_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
uniU8SubZP_MulM_PStoF16In0_2x8.data[7] |= (in0_postShift & 0x1F);
uniU8SubZP_MulM_PStoF16In1_2x8.data[7] |= (in1_postShift & 0x1F);
multAndoutZP0[0] = (uint32_t)(in0_M0);
multAndoutZP0[1] = (uint32_t)((outputZP << in0_postShift) - input0Zp * in0_M0);
multAndoutZP1[0] = (uint32_t)(in1_M0);
multAndoutZP1[1] = (uint32_t)((outputZP << in1_postShift) - input1Zp * in1_M0);
for (idx = 8; idx < 16; idx ++)
{
uniU8SubZP_MulM_PStoF16In0_2x8.data[idx] = (vx_uint32)(in0_M0 << 16) | in0_M0;
uniU8SubZP_MulM_PStoF16In1_2x8.data[idx] = (vx_uint32)(in1_M0 << 16) | in1_M0;
}
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift0_Lo_2x8, in0_postShift );
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift1_Lo_2x8, in1_postShift );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZP_MulM_PStoF16In0_2x8", &uniU8SubZP_MulM_PStoF16In0_2x8 );
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZP_MulM_PStoF16In1_2x8", &uniU8SubZP_MulM_PStoF16In1_2x8 );
"uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8AddZP_2x8", &uniU8AddZP_2x8 );
"uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift0_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"input0Zp", &input0Zp );
status |= vsi_nn_kernel_gpu_add_param( node,
"input1Zp", &input1Zp );
status |= vsi_nn_kernel_gpu_add_param( node,
"outputZP", &outputZP );
"uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift1_Lo_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
@ -501,4 +442,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( select, _setup )

View File

@ -39,7 +39,6 @@
__BEGIN_DECLS
#define _SLICE_KERNEL_SOURCE "slice"
#define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice")
// Add kernel hashtable here
@ -50,30 +49,30 @@ __BEGIN_DECLS
#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D, _SAMEFL) \
(( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D << 1) | (_SAMEFL))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_3D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \
SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D")
#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \
SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
#define SLICE_SH_KERNEL_SAMEFL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL")
#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \
SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
#define SLICE_SH_KERNEL_SAMEFL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL_2D")
#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \
SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
typedef struct
{
@ -85,21 +84,33 @@ __BEGIN_DECLS
static const _kernel_map_type _slice_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F16, I32, F16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( I8, I32, I8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_3D( F16, I32, F16 ),
PACK_KERNEL_MAP_3D( F16, I32, I8 ),
PACK_KERNEL_MAP_3D( F16, I32, U8 ),
PACK_KERNEL_MAP_3D( F16, I32, I16 ),
PACK_KERNEL_MAP_3D( I8, I32, F16 ),
PACK_KERNEL_MAP_3D( U8, I32, F16 ),
PACK_KERNEL_MAP_3D( I16, I32, F16 ),
PACK_KERNEL_MAP_3D( I16, I32, I16 ),
PACK_KERNEL_MAP_3D( U8, I32, U8 ),
PACK_KERNEL_MAP_3D( I8, I32, I8 ),
PACK_KERNEL_MAP_2D( F16, I32, F16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( I8, I32, I8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( F16, I32, F16 ),
PACK_KERNEL_MAP_2D( I16, I32, I16 ),
PACK_KERNEL_MAP_2D( F16, I32, I8 ),
PACK_KERNEL_MAP_2D( F16, I32, U8 ),
PACK_KERNEL_MAP_2D( F16, I32, I16 ),
PACK_KERNEL_MAP_2D( I8, I32, F16 ),
PACK_KERNEL_MAP_2D( U8, I32, F16 ),
PACK_KERNEL_MAP_2D( I16, I32, F16 ),
PACK_KERNEL_MAP_2D( U8, I32, U8 ),
PACK_KERNEL_MAP_2D( I8, I32, I8 ),
PACK_KERNEL_MAP_SAMEFL( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL( I16, I32, I16 ),
PACK_KERNEL_MAP_SAMEFL( U8, I32, U8 ),
PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16 ),
PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8 ),
};
#define _INPUT_NUM (2)
@ -201,18 +212,16 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
scaleOut = output_attr->asymm.scale;
}
if ((F16 == input_dtype)
|| (I16 == input_dtype)
|| (BF16 == input_dtype)
)
if ((I8 == input_dtype && input_dtype == output_dtype ) ||
(U8 == input_dtype && input_dtype == output_dtype ) )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}

View File

@ -1416,31 +1416,42 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create
switch( attr->quant )
{
case VSI_NN_KERNEL_QUANT_DFP:
{
{
int8_t fl = 0;
status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_FIXED_POINT_POS,
&fl, sizeof(int8_t));
CHECK_STATUS( status );
attr->dfp.fl = (int32_t)fl;
if (fl >= 0) {
attr->scale = 1.0f / ((float)((int64_t)1 << fl));
} else {
attr->scale = (float)((int64_t)1 << -fl);
}
break;
} break;
case VSI_NN_KERNEL_QUANT_ASYMM:
{
status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_ZERO_POINT,
&(attr->asymm.zero_point), sizeof(int32_t));
CHECK_STATUS( status );
status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE,
&(attr->asymm.scale), sizeof(float));
CHECK_STATUS( status );
{
status = vxQueryTensor((vx_tensor)tensor,
VX_TENSOR_ZERO_POINT,
&(attr->asymm.zero_point),
sizeof(int32_t));
CHECK_STATUS(status);
status = vxQueryTensor((vx_tensor)tensor,
VX_TENSOR_SCALE,
&(attr->asymm.scale),
sizeof(float));
CHECK_STATUS(status);
// Reset scale to 1e-8
if( (attr->asymm.scale - 0.f) < 1e-8 )
{
if ((attr->asymm.scale - 0.f) < 1e-8)
{
attr->asymm.scale = (float)1e-8;
attr->asymm.zero_point = 0;
}
}
break;
attr->scale = attr->asymm.scale;
attr->zero_point = attr->asymm.zero_point;
}
break;
default:
attr->scale = 1.0f;
break;
}
return attr;

View File

@ -189,6 +189,16 @@ static float celu_eval(float x, vsi_nn_kernel_lut_params *lut_param)
return positive + negative;
}
static float rcp_eval(float x)
{
return 1.0f / x;
}
static float softsign_eval(float x)
{
return x / (1 + vsi_abs(x));
}
static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param)
{
float result = 0;
@ -245,6 +255,12 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *
case VSI_NN_KERNEL_LUT_CELU:
result = celu_eval(data, lut_param);
break;
case VSI_NN_KERNEL_LUT_RCP:
result = rcp_eval(data);
break;
case VSI_NN_KERNEL_LUT_SOFTSIGN:
result = softsign_eval(data);
break;
default:
VSILOGE( "unsupported activation function:%d", lut_param->act_type );
break;

View File

@ -133,5 +133,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(matrixmul)
REGISTER_VX_FIRST_KERNEL_SELECTOR(celu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(rcp)
REGISTER_VX_FIRST_KERNEL_SELECTOR(softsign)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_bilinear)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_nearest)
__END_DECLS

View File

@ -146,6 +146,8 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN )
#undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL

View File

@ -0,0 +1,152 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#define REGISTER_SOFTMAX_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vx_node node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
int32_t type = vsi_nn_kernel_param_get_int32( params, "type" );
#ifdef VX_SCALE_EXTRA_PARAMETER_SUPPORT
vx_nn_scale_params_ext_t param;
param.align_corners = align_corners;
param.half_pixel_centers = half_pixel_centers;
switch (type)
{
case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
break;
case VSI_NN_INTERPOLATION_BILINEAR:
param.base.type = VX_INTERPOLATION_BILINEAR;
break;
case VSI_NN_INTERPOLATION_AREA:
param.base.type = VX_INTERPOLATION_AREA;
break;
default:
param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
}
node = vxTensorScaleNode( graph->g,
inputs[0]->t,
(vx_nn_scale_params)(&param),
sizeof(vx_nn_scale_params_ext_t),
outputs[0]->t );
#else
vx_nn_scale_params_t param;
if (align_corners || half_pixel_centers)
{
return NULL;
}
switch (type)
{
case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
break;
case VSI_NN_INTERPOLATION_BILINEAR:
param.type = VX_INTERPOLATION_BILINEAR;
break;
case VSI_NN_INTERPOLATION_AREA:
param.type = VX_INTERPOLATION_AREA;
break;
default:
param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
break;
}
node = vxTensorScaleNode( graph->g,
inputs[0]->t,
&param,
sizeof(param),
outputs[0]->t );
#endif
if ( NULL == node )
{
VSILOGI("Call vxTensorScaleNode fail.(resize)");
}
return (vsi_nn_kernel_node_t)node;
} /* _setup() */
#define REGISTER_RESIZE_OPENVX_KERNEL(KERNEL_NAME) \
static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num, \
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
) \
{ \
return _setup(graph, inputs, input_num, outputs, output_num, \
params, kernel); \
} \
REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_RESIZE_OPENVX_KERNEL( resize_nearest )
REGISTER_RESIZE_OPENVX_KERNEL( resize_bilinear )
#undef REGISTER_RESIZE_OPENVX_KERNEL

View File

@ -32,7 +32,6 @@
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_lut.h"
static vsi_nn_kernel_node_t _setup
(
@ -46,57 +45,7 @@ static vsi_nn_kernel_node_t _setup
)
{
vx_node node = NULL;
#ifdef VX_USER_LOOKUP_TABLE_SUPPORT
vx_lut lut1 = NULL;
vx_lut lut2 = NULL;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_lut_params lut_param;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}
lut_param.act_type = VSI_NN_KERNEL_LUT_SQUARE;
lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE);
lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE);
if( NULL == lut1 || NULL == lut2 )
{
VSILOGE("create lut object fail.");
goto final;
}
status = vsi_nn_kernel_lut(lut1, lut2, &lut_param);
CHECK_STATUS_FAIL_GOTO(status, final);
node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t);
if ( NULL == node )
{
node = vxActivationLayer(
graph->g,
inputs[0]->t,
VX_NN_ACTIVATION_SQUARE,
0,
0,
outputs[0]->t
);
}
final:
if (lut1)
{
vxReleaseLUT(&lut1);
lut1 = NULL;
}
if (lut2)
{
vxReleaseLUT(&lut2);
lut2 = NULL;
}
return (vsi_nn_kernel_node_t)node;
#else
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@ -107,7 +56,6 @@ final:
);
return (vsi_nn_kernel_node_t)node;
#endif
} /* _setup() */
#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME) \

View File

@ -0,0 +1,478 @@
__kernel void cumsum_F32toF32_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord_out.z = channel - 1;
write_imagef(output, coord_out, sum);
for(coord.z = channel - 1; coord.z > 0; coord.z--)
{
float4 data = read_imagef(input, coord);
coord_out.z--;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(exclusive)
{
coord_out.z = 0;
write_imagef(output, coord_out, sum);
for(coord.z = 0; coord.z < channel - 1; coord.z++)
{
float4 data = read_imagef(input, coord);
coord_out.z++;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
else
{
for(coord.z = 0; coord.z < channel; coord.z++)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
}
__kernel void cumsum_U8toU8_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0.0f;
if(exclusive && rev)
{
coord_out.z = channel - 1;
write_imageui(output, coord_out, dst);
for(coord.z = channel - 1; coord.z > 0; coord.z--)
{
uint4 data = read_imageui(input, coord);
coord_out.z--;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(exclusive)
{
coord_out.z = 0;
write_imageui(output, coord_out, dst);
for(coord.z = 0; coord.z < channel - 1; coord.z++)
{
uint4 data = read_imageui(input, coord);
coord_out.z++;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
else
{
for(coord.z = 0; coord.z < channel; coord.z++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
}
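/* Axis-1 variants: identical accumulation pattern, scanning along the
 * height (y) dimension instead of the channel. */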
__kernel void cumsum_F32toF32_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord_out.y = height - 1;
write_imagef(output, coord_out, sum);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
float4 data = read_imagef(input, coord);
coord_out.y--;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(exclusive)
{
coord_out.y = 0;
write_imagef(output, coord_out, sum);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
float4 data = read_imagef(input, coord);
coord_out.y++;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
}
__kernel void cumsum_U8toU8_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0;
if(exclusive && rev)
{
coord_out.y = height - 1;
write_imageui(output, coord_out, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
coord_out.y--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(exclusive)
{
coord_out.y = 0;
write_imageui(output, coord_out, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
coord_out.y++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
}
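/* Axis-0 variants: identical accumulation pattern, scanning along the
 * width (x) dimension. */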
__kernel void cumsum_F32toF32_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord_out.x = width - 1;
write_imagef(output, coord_out, sum);
for(coord.x = width - 1; coord.x > 0; coord.x--)
{
float4 data = read_imagef(input, coord);
coord_out.x--;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(exclusive)
{
coord_out.x = 0;
write_imagef(output, coord_out, sum);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
float4 data = read_imagef(input, coord);
coord_out.x++;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
}
__kernel void cumsum_U8toU8_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0;
if(exclusive && rev)
{
coord_out.x = width - 1;
write_imageui(output, coord_out, dst);
for(coord.x = width - 1; coord.x > 0; coord.x--)
{
uint4 data = read_imageui(input, coord);
coord_out.x--;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(exclusive)
{
coord_out.x = 0;
write_imageui(output, coord_out, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
uint4 data = read_imageui(input, coord);
coord_out.x++;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
}

View File

@ -0,0 +1,314 @@
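/* 2D cumulative-sum kernels for plain image2d_t tensors. coord.xy is the
 * read position and coord.zw the write position, so the exclusive modes can
 * write one element ahead of (or behind) the element being read. */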
__kernel void cumsum_F32toF32_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord.w = height - 1;
write_imagef(output, coord.zw, sum);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
coord.w--;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(exclusive)
{
write_imagef(output, coord.zw, sum);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
coord.w++;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
}
__kernel void cumsum_U8toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0;
if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, sum);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
write_imageui(output, coord.zw, sum);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}
__kernel void cumsum_F32toF32_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imagef(output, coord.zw, sum);
for(; coord.x > 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
coord.z--;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(exclusive)
{
coord.z = 0;
write_imagef(output, coord.zw, sum);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
coord.z++;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
}
__kernel void cumsum_U8toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0.0f;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, sum);
for(; coord.x > 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
coord.z--;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, sum);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}

Some files were not shown because too many files have changed in this diff.