diff --git a/.gitignore b/.gitignore index 8def752..cef9f4c 100644 --- a/.gitignore +++ b/.gitignore @@ -336,3 +336,4 @@ ASALocalRun/ # IDE .settings/ build/ +*_build/ diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION index 703a8d3..e5a66ba 100644 --- a/prebuilt-sdk/x86_64_linux/VERSION +++ b/prebuilt-sdk/x86_64_linux/VERSION @@ -1 +1 @@ -REL/6.4.10.2 +6.4.11 diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h index a8ea910..8b93beb 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h @@ -499,6 +499,8 @@ enum vx_kernel_e { VX_KERNEL_NN_DECONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x31, + VX_KERNEL_STREAM_PROCESSOR = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x32, + VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */ }; diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h index 74f3592..6cf283c 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h @@ -196,4 +196,45 @@ VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support d #define VX_TENSOR_STRIDE_X_BITS_SUPPORT 1 #endif +/* +VX_REMOVE_RESHAPE_SUPPORT is used to declare if graph opt support to remove reshape op, if support, it's not need to remove reshape in ovxlib. + 0: not support + 1: support +*/ +/* +#ifndef VX_REMOVE_RESHAPE_SUPPORT +#define VX_REMOVE_RESHAPE_SUPPORT 0 +#endif +*/ + +/* +VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can support vxStreamProcessorNode API + [value] + 0: not support + 1: support +*/ +#ifndef VX_STREAM_PROCESSOR_SUPPORT +#define VX_STREAM_PROCESSOR_SUPPORT 0 +#endif + +/* + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL is used to declare that this tensor connect to fixed DMA channel. + [value] + 0: not support + 1: support +*/ +#ifndef VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL +#define VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL 1 +#endif + +/* + VX_SCALE_EXTRA_PARAMETER_SUPPORT is used to declare that RESIZE can support align_cornor and half_pixel_center parameter + [value] + 0: not support + 1: support +*/ +#ifndef VX_SCALE_EXTRA_PARAMETER_SUPPORT +#define VX_SCALE_EXTRA_PARAMETER_SUPPORT 1 +#endif + #endif /* __VX_KHR_COMPATIBLE_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h index cca4338..623c541 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h @@ -57,6 +57,12 @@ enum vx_graph_attribute_internal_type_e VX_GRAPH_AXI_SRAM_PRE_LOAD = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x2, /*! \brief Queries a graph for its running priority (read-write. Use a \ref vx_uint32 parameter. 
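For orientation only (not part of the patch): a minimal sketch of how a client layer such as ovxlib is expected to gate its code paths on the vx_khr_compatible.h feature macros added above. The helper name is made up for illustration.

    #include <VX/vx_khr_compatible.h>

    /* Hypothetical helper: compile-time check for the stream processor path. */
    static int use_stream_processor_path(void)
    {
    #if VX_STREAM_PROCESSOR_SUPPORT
        return 1;   /* driver exports the stream processor API */
    #else
        return 0;   /* fall back to the existing shader/NN kernel paths */
    #endif
    }
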
*/ VX_GRAPH_PRIORITY_VALUE_VIV = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x3, + VX_GRAPH_PSI_EXTRATOR_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x4, + VX_GRAPH_PSI_FILLER_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x5, + VX_GRAPH_DENOISE_POSTPROCESS_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x6, + VX_GRAPH_DATA_COMPRESSION_RATIO = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x7, + VX_GRAPH_ISP_EMULATION_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x8, + VX_GRAPH_PROCESS_FPS = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x9, }; /*! \brief Size Alignment of User Memory @@ -209,7 +215,8 @@ enum vx_nn_activation_function_e VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x4, VX_NN_ACTIVATION_SWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x5, VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6, - VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7, + VX_NN_ACTIVATION_CUSTOM = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7, + VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8, }; /*! \brief The Convolutional network type @@ -285,6 +292,59 @@ enum vx_tensor_rank_type_e VX_TENSOR_RANK_SN, }; +/*! \brief The attribute of tensor. + * \ingroup group_tensor + * \version 0.4 + */ +enum vx_tensor_priority_e +{ + /*! \brief no special requirement */ + VX_TENSOR_DEFAULT = 0, + + /*! \brief 2nd input(reference) */ + /*VX_TENSOR_2ND_INPUT_FOR = 1,*/ + VX_TENSOR_FOR_GRAPH_REFERENCE = 1, +}; + + +/*! \brief The attribute of tensor memory. + * \ingroup group_tensor + * \version 0.4 + */ +enum vx_tensor_memory_attribute_e +{ + /*! \brief no special requirement */ + VX_TENSOR_MEMORY_DEFAULT = 0, + + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_0 = (0x1 << 0), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_1 = (0x1 << 1), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2 = (0x1 << 2), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_3 = (0x1 << 3), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_4 = (0x1 << 4), + /* + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_5 = (0x1 << VX_DMA5_IN_ISP_OCM_PSI), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_6 = (0x1 << VX_DMA6_DDR_DECOMPRESS), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_7 = (0x1 << VX_DMA7_POSTOUT_OCM_ISP), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_8 = (0x1 << VX_DMA8_COMPRESS_DDR), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_9 = (0x1 << VX_DMA9_ISP_PATTERN_GENERATOR), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_10 = (0x1 << VX_DMA10_ISP_CHECKSUM_GENERATOR), + */ + /*! \brief DMA transfer data to VIP and enable circular buffer */ +#if !VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL + VX_TENSOR_MEMORY_ENABLE_CIRCULAR_BY_DMA = 0xFFFFFFFF, +#endif +}; + +enum vx_dma_extrator_pad_mode_e +{ + /*! \brief no special requirement */ + VX_DMA_EXTRATOR_PAD_CONST = 0, + + /*! \brief DMA extrator pad with nearest edge */ + VX_DMA_EXTRATOR_PAD_WITH_NEAREAST_EDGE = 1, +}; + + /*! \brief The precision of tensor. * \ingroup group_tensor * \version 0.4 @@ -601,6 +661,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* n */ VX_API_ENTRY vx_status VX_API_CALL vxSetTensorAttribute(vx_tensor tensor, vx_enum attribute, const void *ptr, vx_size size); +/*! \brief Creates an opaque reference to a tensor data buffer. + * \details The tensor is a dummy tensor which will not allocate any memory. 
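A minimal sketch (not part of the patch) of filling the new extended resize parameters. The scale-layer node creator that consumes vx_nn_scale_params_ext_t lives in vx_khr_nn.h but is outside this hunk, so the hypothetical helper below only populates the struct; the driver honors the extra fields only when VX_SCALE_EXTRA_PARAMETER_SUPPORT is 1.

    #include <VX/vx_khr_compatible.h>
    #include <VX/vx_khr_nn.h>

    /* Hypothetical helper: fill the extended scale params added above. */
    static void fill_scale_params_ext(vx_nn_scale_params_ext_t *p,
                                      vx_bool align_corners,
                                      vx_bool half_pixel_centers)
    {
        p->base.type          = VX_INTERPOLATION_BILINEAR; /* only bilinear is supported */
        /* Honored by the driver only when VX_SCALE_EXTRA_PARAMETER_SUPPORT == 1. */
        p->align_corners      = align_corners;
        p->half_pixel_centers = half_pixel_centers;
    }
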
And it cannot reshape or view. + * Not guaranteed to exist until the vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] number_of_dims The number of dimensions. + * \param [in] dims Dimensions sizes in elements. + * \param [in] data_format The \ref vx_type_e that represents the data format of the tensor data elements. + * \return A tensor data reference or zero when an error is encountered. + * \ingroup group_tensor + * \version 0.3 + */ +VX_API_ENTRY vx_tensor VX_API_CALL vxCreateDummyTensor(vx_context context, vx_size number_of_dims, const vx_size *dims, vx_enum data_format); + /*! \brief The type enumeration lists all NN extension types. * \ingroup group_cnn @@ -1317,6 +1390,13 @@ typedef struct _vx_nn_scale_params_t vx_enum type; /*!< \brief The interpolation type, only support VX_INTERPOLATION_BILINEAR. */ } vx_nn_scale_params_t, * vx_nn_scale_params; +typedef struct _vx_nn_scale_params_ext_t +{ + vx_nn_scale_params_t base; + vx_bool align_corners; + vx_bool half_pixel_centers; +} vx_nn_scale_params_ext_t, * vx_nn_scale_params_ext; + /*! \brief [Graph] Creates a scale Layer Node. * \param [in] graph The reference to the parent graph. * \param [in] input The input tensor data to scale. @@ -2054,8 +2134,15 @@ typedef struct _vx_hardware_caps_params_ext_t vx_hardware_caps_params_t base; vx_uint32 subGroupSize; /*!< \brief shader sub-group size.*/ vx_bool supportVA40; /*!< \brief support 40bit virtual address.*/ + vx_uint32 supportStreamProcessor; /*!< \brief support stream processor.*/ } vx_hardware_caps_params_ext_t; +typedef struct _vx_hardware_caps_params_ext2_t +{ + vx_hardware_caps_params_ext_t base; + vx_uint32 streamProcessorExecCount; /*!< \brief streamprocess execution count. */ +} vx_hardware_caps_params_ext2_t; + /*! \brief Queries hardware caps information. * \param [in] context The reference to the context. * \param [in] hardware_caps_params \ref vx_hardware_caps_params_t . diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h index 506938f..66427cb 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h @@ -219,6 +219,15 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext4_t vx_bool enable_nn_tensor_add_relu; /*!< \brief Enable Relu function after tensor add. */ } vx_nn_convolution_relu_pooling_params_ext4_t, * vx_nn_convolution_relu_pooling_params_ext4; +typedef struct _vx_nn_convolution_relu_pooling_params_ext5_t +{ + vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params_ext_t */ + + vx_object_array inputs_list; + vx_object_array outputs_list; + vx_spinst spinst_obj; +} vx_nn_convolution_relu_pooling_params_ext5_t, * vx_nn_convolution_relu_pooling_params_ext5; + /*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion. * \details This function implement Convolutional Network Convolution and Activation(Relu) and Pooling layer. * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. 
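A minimal usage sketch (not part of the patch) for the vxCreateDummyTensor() entry point declared above; the helper name and dimension values are only an example.

    #include <VX/vx.h>
    #include <VX/vx_khr_nn.h>

    /* Hypothetical helper: create a non-allocating placeholder tensor. */
    static vx_tensor make_dummy_fp16_tensor(vx_context context)
    {
        const vx_size dims[4] = { 224, 224, 3, 1 };
        vx_tensor t = vxCreateDummyTensor(context, 4, dims, VX_TYPE_FLOAT16);

        /* The dummy tensor owns no memory and cannot be reshaped or viewed;
         * check the reference before wiring it into a graph. */
        if (vxGetStatus((vx_reference)t) != VX_SUCCESS)
        {
            return NULL;
        }
        return t;
    }
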
The number of the accumulator bits are implementation defined, diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h index bf513b5..64504ca 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h @@ -963,6 +963,40 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmNode(vx_graph graph, vx_scalar trans_c, vx_tensor output); +typedef struct _vx_lut_params_s +{ + vx_enum lut_function; /*!< \brief Set VX_NN_ACTIVATION_NONE to disable lut table or set VX_NN_ACTIVATION_CUSTOM to customize lut table or set others to use fixed lut table */ + vx_float32 float_values[4]; /*!< \brief Float parameters of fixed lut table */ + vx_uint32 fvalues_count; /*!< \brief Count of float_values */ + vx_int32 int_values[4]; /*!< \brief Int parameters of fixed lut table */ + vx_uint32 ivalues_count; /*!< \brief Count of int_values */ + vx_lut in_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */ + vx_lut out_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */ +} vx_lut_params_s, * vx_lut_params; + +/*! \brief Create a stream processor node. + * \param [in] graph The reference to the graph. + * \param [in] input_list The input tensor list. + * \param [in] input_count The input tensor count. + * \param [in] output_list The output tensor list. + * \param [in] output_count The output tensor count. + * \param [in] spinst_obj The stream processor instrunction object. Use vxCreateSPINST() to create. + * \param [in] lut_params The lut parameters. Refer to vx_lut_params_s. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + * \ingroup group_vision_function_sp + */ +VX_API_ENTRY vx_node VX_API_CALL vxStreamProcessorNode( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor* output_list, + vx_uint32 output_count, + vx_spinst spinst_obj, + vx_lut_params lut_params + ); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h new file mode 100644 index 0000000..bcfe401 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright 2017 - 2021 Vivante Corporation, Santa Clara, California. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VX_SPINST_H_ +#define _VX_SPINST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum _vx_sp_inst_type_e +{ + VX_SP_INST_TYPE_FADD, + VX_SP_INST_TYPE_FMULT, + VX_SP_INST_TYPE_MOVE, + VX_SP_INST_TYPE_PWL, + + VX_SP_INST_TYPE_COUNT, +} +vx_sp_inst_type_e; + +typedef enum _vx_sp_inst_type_fadd_e +{ + VX_SP_INST_TYPE_FADD_IDLE, // FADD-IDLE + VX_SP_INST_TYPE_FADD_ADD, // dst = src0 + src1 + VX_SP_INST_TYPE_FADD_SUB, // dst = src0 - src1 + + VX_SP_INST_TYPE_FADD_COUNT, +} +vx_sp_inst_type_fadd_e; + +typedef enum _vx_sp_inst_type_fmult_e +{ + VX_SP_INST_TYPE_FMULT_IDLE, /* FMULT-IDLE */ + VX_SP_INST_TYPE_FMULT_MUL, /* dst = src0 * src1 */ + VX_SP_INST_TYPE_FMULT_MUL_CLAMP, /* dst = clamp (src0, src1, R6, R7) */ + + VX_SP_INST_TYPE_FMULT_COUNT, +} +vx_sp_inst_type_fmult_e; + +typedef enum _vx_sp_inst_type_move_e +{ + VX_SP_INST_TYPE_MOVE_IDLE, + VX_SP_INST_TYPE_MOVE_MOVE, // dst = src1 + VX_SP_INST_TYPE_MOVE_SEL0, // dst = (src0 > 0) ? src1[0] : src1[1] + VX_SP_INST_TYPE_MOVE_SEL1, // dst = (src0 > 0) ? src1 : FA-src0 // use FA's SRC0 + VX_SP_INST_TYPE_MOVE_IMMD, // dst = Constant assign immmediate + VX_SP_INST_TYPE_MOVE_ABS, // dst = abs(src1) + + VX_SP_INST_TYPE_MOVE_COUNT, +} +vx_sp_inst_type_move_e; + +typedef enum _vx_sp_inst_type_pwl_e +{ + VX_SP_INST_TYPE_PWL_IDLE, + VX_SP_INST_TYPE_PWL_SETUP_0, /* PWL ID = 0 */ + VX_SP_INST_TYPE_PWL_SETUP_1, /* Sigmode() */ + VX_SP_INST_TYPE_PWL_SETUP_2, /* Tanh() */ + + VX_SP_INST_TYPE_PWL_COUNT, +} +vx_sp_inst_type_pwl_e; + +typedef enum _vx_sp_inst_src_dst_e +{ + VX_SP_INST_SPINOUT, + VX_SP_INST_SR1, + VX_SP_INST_SR2, + VX_SP_INST_SR3, + VX_SP_INST_SR4, + VX_SP_INST_SR5, + VX_SP_INST_SR6, /* nn_clamp_min */ + VX_SP_INST_SR7, /* nn_clamp_max */ + VX_SP_INST_SR8, + VX_SP_INST_SR9, + VX_SP_INST_SR10, + VX_SP_INST_VR11, + VX_SP_INST_VR12, + VX_SP_INST_VR13, + VX_SP_INST_VR14, + VX_SP_INST_SETUPOUT, /* Input of PWL Mult and Add: FMInA, FMInB, FAInA, FAInB */ +} +vx_sp_inst_src_dst_e; + +typedef struct _vx_spinst_unit_param +{ + vx_enum op; /* vx_sp_inst_type_e */ + + struct + { + vx_enum op; /* vx_sp_inst_type_fadd/fmult/move/pwl_e */ + + struct + { + vx_uint8 src0; /* vx_sp_inst_src_dst_e */ + vx_uint8 src1; /* vx_sp_inst_src_dst_e */ + vx_uint8 dst; /* vx_sp_inst_src_dst_e */ + vx_float32 constant; + } var; + + } sub; + +} +vx_spinst_unit_param; + +/**********************************************************************************************/ + +typedef enum _vx_sp_attribute_e +{ + VX_SP_ATTRIBUTE_NONE, + + VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_X, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Y, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Z, + + VX_SP_ATTRIBUTE_PROG_INIT_INSTR_NUM, + VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM, + VX_SP_ATTRIBUTE_PROG_COMPLETE_INSTR_NUM, + VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE, + VX_SP_ATTRIBUTE_INPUT_SETUP, + + VX_SP_ATTRIBUTE_IGNORED_LEADING_OUTPUTS, + VX_SP_ATTRIBUTE_FLUSH_CYCLE_NUM, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_WR, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_WR, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_RD, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_RD, + + VX_SP_ATTRIBUTE_CH0_POST_REDISTRIBUTE, + VX_SP_ATTRIBUTE_CH1_POST_REDISTRIBUTE, + 
VX_SP_ATTRIBUTE_V11_RESET_AT_START, + VX_SP_ATTRIBUTE_V12_RESET_AT_START, + VX_SP_ATTRIBUTE_V11_POP_CONFIG, + VX_SP_ATTRIBUTE_V12_POP_CONFIG, + VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT, + VX_SP_ATTRIBUTE_IGNORED_LEADING_ACC_OUT, + VX_SP_ATTRIBUTE_SUM_ENGINE_RESET, + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL, + VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE, + VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE, + + VX_SP_ATTRIBUTE_GENERAL_COUNT, + + VX_SP_ATTRIBUTE_CONST0, /* NN post multiplier */ + VX_SP_ATTRIBUTE_CONST1, /* NN neg pos multiplier */ + VX_SP_ATTRIBUTE_CONST2, /* NN tensor add const */ + VX_SP_ATTRIBUTE_CONST3, /* NN clamp max */ + VX_SP_ATTRIBUTE_CONST4, /* NN clmap min */ + + VX_SP_ATTRIBUTE_TOTAL_COUNT, +} +vx_sp_attribute_e; + +typedef enum _vx_sp_attribute_input_tile_mapping_e +{ + VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_XYMERGE, + VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_YZMERGE, +} +vx_sp_attribute_input_tile_mapping_e; + +typedef enum _vx_sp_attribute_output_collapse_e +{ + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_DISABLED, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_ENABLED, +} +vx_sp_attribute_output_collapse_e; + +typedef enum _vx_sp_attribute_rounding_mode_e +{ + VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_RTNE, + VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_STICKY, +} +vx_sp_attribute_rounding_mode_e; + +typedef enum _vx_sp_attribute_input_setup_e +{ + VX_SP_ATTRIBUTE_INPUT_SETUP_SINGLE_INPUT, + VX_SP_ATTRIBUTE_INPUT_SETUP_INTERLEAVE_TWO_INPUTS, + VX_SP_ATTRIBUTE_INPUT_SETUP_V11, + VX_SP_ATTRIBUTE_INPUT_SETUP_V12, +} +vx_sp_attribute_input_setup_e; + +typedef enum _vx_sp_attribute_ch_post_redistribute_e +{ + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_DISABLED, + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_SCALAR_GATHER, + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_GATHER, + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_SCATTER, +} +vx_sp_attribute_ch_post_redistribute_e; + +typedef enum _vx_sp_attribute_v_reset_at_start_e +{ + VX_SP_ATTRIBUTE_V_RESET_AT_START_NONE, + VX_SP_ATTRIBUTE_V_RESET_AT_START_RESET, +} +vx_sp_attribute_v_reset_at_start_e; + +typedef enum _vx_sp_attribute_v_pop_config_e +{ + VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_READ, + VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_ROW, +} +vx_sp_attribute_v_pop_config_e; + +typedef enum _vx_sp_attribute_accelerator_input_select_e +{ + VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_OUTPUT, + VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_ACCLERATOR, +} +vx_sp_attribute_accelerator_input_select_e; + +typedef enum _vx_sp_attribute_sum_engine_reset_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_NONE, + VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_RESET, +} +vx_sp_attribute_sum_engine_reset_e; + +typedef enum _vx_sp_attribute_sum_engine_control_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_INTERNAL, + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_1D, + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_2D, +} +vx_sp_attribute_sum_engine_control_e; + +typedef enum _vx_sp_attribute_sum_engine_num_ch_minus_one_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_ONE_CH, + VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_TWO_CH, +} +vx_sp_attribute_sum_engine_num_ch_minus_one_e; + +typedef enum _vx_sp_attribute_sum_engine_2d_accum_storage_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_SAME, + VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_DIFFERENT, +} +vx_sp_attribute_sum_engine_2d_accum_storage_e; + +/**********************************************************************************************/ + +/*! \brief Creates an opaque reference to a spinst data. + * \param [in] context The reference to the implementation context. 
+ * \return A spinst data reference. + * \Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST( + vx_context context + ); + +/*! \brief Releases a reference to a spinst object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] spinst_obj The pointer to the spinst data to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; all other values indicate failure + * \retval * An error occurred. See \ref vx_status_e. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST( + vx_spinst *spinst_obj + ); + +/*! \brief Add a instruction to spinst object. + * \param [in] spinst_obj The reference to the spinst object. + * \param [in] inst_unit_array The units of one instruction. Use a \ref vx_spinst_unit_param. + * \param [in] inst_unit_count The count of instruction units. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If data is not a \ref spinst_obj. + * \retval VX_ERROR_INVALID_PARAMETERS If any of parameters is incorrect. + * \retval VX_ERROR_NO_MEMORY If fail to allocate internal instruction memory. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddOneInstToSPINST( + vx_spinst spinst_obj, + vx_spinst_unit_param* inst_unit_array, + vx_uint8 inst_unit_count + ); + +/*! \brief Set various attributes of a spinst data. + * \param [in] spinst_obj The reference to the vx_spinst object to set. + * \param [in] attribute The attribute to set. Use a \ref vx_sp_attribute_e. + * \param [in] value The value of attribute. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If data is not a \ref vx_spinst. + * \retval VX_ERROR_INVALID_PARAMETERS If any of attribute is incorrect. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetAttributeToSPINST( + vx_spinst spinst_obj, + vx_enum attribute, + vx_uint32 value + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h index 0dbdcc8..e10a32e 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h @@ -342,6 +342,10 @@ typedef struct _vx_tensorpatch_addressing_t * vx_trensor_addressing; */ typedef struct _vx_weights_biases_parameter_s * vx_weights_biases_parameter; +/*! \brief The object for stream processor + * \ingroup group_spinst + */ +typedef struct _vx_spinst_s * vx_spinst; /*! \brief A Boolean value. * This allows 0 to be FALSE, as it is in C, and any non-zero to be TRUE. @@ -470,6 +474,7 @@ enum vx_type_e { /* \todo add new object types here */ VX_TYPE_BFLOAT16 = 0x81A,/*!< \brief A \ref vx_bfloat16. */ + VX_TYPE_SPINST = 0x81B,/*!< \brief A \ref vx_spinst. */ VX_TYPE_INT4 = 0x81C,/*!< \brief A \ref signed 4bits tensor.. */ VX_TYPE_UINT4 = 0x81D,/*!< \brief A \ref unsigned 4bits tensor.. */ }; @@ -1021,6 +1026,8 @@ enum vx_node_attribute_e { VX_NODE_ATTRIBUTE_CONST_TENSOR_CACHE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x9, + VX_NODE_ATTRIBUTE_FOR_HW_QUALITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA, + }; /*! 
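The SPINST and stream processor additions above fit together roughly as follows. This is an illustrative sketch, not code from the patch: the single FADD instruction, the loop-instruction count of 1, and the assumption that the node keeps its own reference to the spinst object are made up for the example.

    #include <VX/vx.h>
    #include <VX/vx_spinst.h>
    #include <VX/vx_nodes.h>
    #include <VX/vx_khr_nn.h>
    #include <string.h>

    /* Hypothetical helper: program "out = in0 + in1" on the FADD unit and
     * attach it to the graph as a stream processor node. */
    static vx_node add_stream_processor_node(vx_context context,
                                             vx_graph graph,
                                             vx_tensor in0, vx_tensor in1,
                                             vx_tensor out)
    {
        vx_tensor inputs[2]  = { in0, in1 };
        vx_tensor outputs[1] = { out };
        vx_spinst sp = vxCreateSPINST(context);
        vx_spinst_unit_param unit;
        vx_lut_params_s lut;
        vx_node node;

        memset(&unit, 0, sizeof(unit));
        unit.op           = VX_SP_INST_TYPE_FADD;      /* use the FADD unit */
        unit.sub.op       = VX_SP_INST_TYPE_FADD_ADD;  /* dst = src0 + src1 */
        unit.sub.var.src0 = VX_SP_INST_SR1;
        unit.sub.var.src1 = VX_SP_INST_SR2;
        unit.sub.var.dst  = VX_SP_INST_SPINOUT;
        vxAddOneInstToSPINST(sp, &unit, 1);

        /* One instruction in the loop body (attribute value assumed here). */
        vxSetAttributeToSPINST(sp, VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM, 1);

        memset(&lut, 0, sizeof(lut));
        lut.lut_function = VX_NN_ACTIVATION_NONE;      /* no LUT stage */

        node = vxStreamProcessorNode(graph, inputs, 2, outputs, 1, sp, &lut);
        vxReleaseSPINST(&sp);   /* assumed: the node holds its own reference */
        return node;
    }
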
\brief The parameter attributes list @@ -1290,6 +1297,9 @@ enum vx_tensor_attribute_e VX_TENSOR_LIFETIME = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x5, /*! \brief the value status of tensor. */ VX_TENSOR_VALUE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x6, + /*XiaoMi project*/ + VX_TENSOR_INPUT_FOR_REFERENCE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x7, + VX_TENSOR_MEMORY_ATTRIBUTE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x8, }; /*! \brief The meta valid rectangle attributes. diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so index 5f9565c..793d7a3 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so index d278960..b86b927 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so index 213d250..1b79027 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so index 434ffc4..06d1f8a 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so index d88e0ce..62fca2c 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so deleted file mode 120000 index 664ae82..0000000 --- a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so +++ /dev/null @@ -1 +0,0 @@ -libOpenVX.so.1.3.0 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so new file mode 100755 index 0000000..6d83612 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 deleted file mode 120000 index 664ae82..0000000 --- a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 +++ /dev/null @@ -1 +0,0 @@ -libOpenVX.so.1.3.0 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 new file mode 100755 index 0000000..6d83612 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 index ebea7d4..6d83612 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so index ee7b8f8..6658be6 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so index 2339562..2c6a14b 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff 
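A minimal sketch (not part of the patch) of the intended use of the new VX_TENSOR_MEMORY_ATTRIBUTE together with the vx_tensor_memory_attribute_e flags added in vx_khr_nn.h. The vx_uint32 bitmask value type is an assumption; the hunks do not state the expected parameter size.

    #include <VX/vx.h>
    #include <VX/vx_types.h>
    #include <VX/vx_khr_nn.h>

    /* Hypothetical helper: bind a tensor to fixed DMA channel 2. */
    static vx_status bind_tensor_to_dma_channel2(vx_tensor tensor)
    {
        vx_uint32 mem_attr = VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2;
        return vxSetTensorAttribute(tensor, VX_TENSOR_MEMORY_ATTRIBUTE,
                                    &mem_attr, sizeof(mem_attr));
    }
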
--git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so index bb370e9..3621c9a 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index fa4dc17..ae52716 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -172,3 +172,10 @@ DEF_OP(PRE_PROCESS_RGB888_PLANAR) DEF_OP(GATHER_ELEMENTS) DEF_OP(SELU) DEF_OP(CELU) +DEF_OP(MAX_POOL3D) +DEF_OP(RCP) +DEF_OP(SIGN) +DEF_OP(SOFTSIGN) +DEF_OP(CUMSUM) +DEF_OP(MAXPOOLWITHARGMAX) +DEF_OP(MOD) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h b/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h index a7ce5e3..a5dcb34 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h @@ -25,10 +25,13 @@ #ifndef _VSI_NN_GPU_CONFIG_H #define _VSI_NN_GPU_CONFIG_H -#define GPU_TENSOR_MAX_WIDTH (65536) +#ifdef VSI_40BIT_VA_SUPPORT +#define GPU_TENSOR_MAX_WIDTH (1 << 30) +#else +#define GPU_TENSOR_MAX_WIDTH (1 << 16) +#endif #define GPU_MAX_MULTIPLIER_NUM (65535) #define GPU_MAX_POST_SHIFT_BITS (31) #define GPU_TENSOR_DIM_2 (2) #endif - diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 501dd5d..7d75720 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -156,6 +156,8 @@ typedef struct vsi_nn_kernel_quant_asymm_t asymm; vsi_nn_kernel_quant_asymm_perchannel_t asymm_v; }; + float scale; + int32_t zero_point; } vsi_nn_kernel_tensor_attr_t; typedef struct @@ -411,7 +413,7 @@ vsi_status vsi_nn_kernel_node_pass_param size_t num ); -static inline void vsi_nn_kernel_node_release +static VSI_INLINE_API void vsi_nn_kernel_node_release ( vsi_nn_kernel_node_t * node ) @@ -422,7 +424,7 @@ static inline void vsi_nn_kernel_node_release } } -static inline void vsi_nn_kernel_node_pack_io +static VSI_INLINE_API void vsi_nn_kernel_node_pack_io ( vsi_nn_kernel_node_param_t * params, size_t param_num, @@ -476,7 +478,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector ); /** Map data type to gpu internal dtype. 
*/ -static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype +static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype ( vsi_nn_type_e dtype ) @@ -516,7 +518,7 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype return I8; } /* vsi_nn_kernel_map_dtype() */ -static inline vsi_nn_type_e vsi_nn_dtype_map_kernel +static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel ( vsi_nn_kernel_dtype_e dtype ) @@ -556,7 +558,7 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel return VSI_NN_TYPE_INT8; } /* vsi_nn_kernel_map_dtype() */ -static inline size_t vsi_nn_kernel_dtype_get_bytes +static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes ( vsi_nn_kernel_dtype_e dtype ) @@ -585,7 +587,7 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes return 0; } /* vsi_nn_kernel_dtype_get_bytes() */ -static inline vsi_size_t vsi_nn_kernel_dtype_get_bits +static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits ( vsi_nn_kernel_dtype_e dtype ) @@ -617,7 +619,7 @@ static inline vsi_size_t vsi_nn_kernel_dtype_get_bits return 0; } /* vsi_nn_kernel_dtype_get_bits() */ -static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type +static VSI_INLINE_API vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type ( vsi_nn_qnt_type_e quant_type ) { switch( quant_type ) @@ -658,7 +660,7 @@ vsi_nn_kernel_scalar_t vsi_nn_kernel_scalar_create const void * data ); -static inline void vsi_nn_kernel_scalar_release +static VSI_INLINE_API void vsi_nn_kernel_scalar_release ( vsi_nn_kernel_scalar_t * scalar ) { if( scalar && *scalar ) @@ -803,7 +805,7 @@ vsi_status vsi_nn_kernel_tensor_write size_t size ); -static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size +static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_size ( const vsi_nn_kernel_tensor_attr_t * attr ) { if( !attr ) @@ -813,7 +815,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size return vsi_nn_shape_get_size( attr->shape->data, (vsi_size_t)attr->shape->size ); } /* vsi_nn_kernel_tensor_attr_get_size() */ -static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes +static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes ( const vsi_nn_kernel_tensor_attr_t * attr ) { vsi_size_t i = 0; @@ -851,7 +853,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes return bytes; } /* vsi_nn_kernel_tensor_attr_get_bytes() */ -static inline void vsi_nn_kernel_tensor_attr_get_stride +static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride ( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride) { vsi_size_t type_bits; @@ -902,7 +904,7 @@ static inline void vsi_nn_kernel_tensor_attr_get_stride } } /* vsi_nn_kernel_tensor_attr_get_size() */ -static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized +static VSI_INLINE_API vsi_bool vsi_nn_kernel_tensor_attr_is_quantized ( const vsi_nn_kernel_tensor_attr_t * attr ) { return ( attr && attr->quant > VSI_NN_KERNEL_QUANT_NONE @@ -1072,7 +1074,7 @@ OVXLIB_API vsi_status vsi_nn_KernelGpuConfig const gpu_param_t * gpu_param ); -static inline const char* vsi_nn_kernel_type_str +static VSI_INLINE_API const char* vsi_nn_kernel_type_str ( vsi_nn_kernel_type_e type ) @@ -1095,7 +1097,7 @@ static inline const char* vsi_nn_kernel_type_str return "None"; } /* vsi_nn_kernel_type_str() */ -static inline vsi_status vsi_nn_kernel_unpack_4bit_data +static VSI_INLINE_API vsi_status vsi_nn_kernel_unpack_4bit_data ( const vsi_nn_kernel_tensor_attr_t * attr, uint8_t * src, @@ -1162,7 +1164,7 @@ static inline vsi_status 
vsi_nn_kernel_unpack_4bit_data return status; } -static inline vsi_status vsi_nn_kernel_pack_4bit_data +static VSI_INLINE_API vsi_status vsi_nn_kernel_pack_4bit_data ( const vsi_nn_kernel_tensor_attr_t * attr, uint8_t * src, diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h index 53c4969..c872cca 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -46,6 +46,8 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum VSI_NN_KERNEL_LUT_CLIP = 12, VSI_NN_KERNEL_LUT_SQUARE = 13, VSI_NN_KERNEL_LUT_CELU = 14, + VSI_NN_KERNEL_LUT_RCP = 15, + VSI_NN_KERNEL_LUT_SOFTSIGN = 16, }; #define VSI_NN_KERNEL_LUT_MAX_SIZE (1024) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h b/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h index 5a74974..3a9e98b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h @@ -30,11 +30,20 @@ extern "C" { #endif +typedef struct _vsi_nn_crop_lcl_data +{ + vx_int32 begin_dims[VSI_NN_MAX_DIM_NUM]; + vx_int32 end_dims[VSI_NN_MAX_DIM_NUM]; + vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM]; +} vsi_nn_crop_lcl_data; + typedef struct _vsi_nn_crop_param { int32_t axis; uint32_t dims; uint32_t offset[VSI_NN_MAX_DIM_NUM]; + + vsi_nn_crop_lcl_data *lcl_data; } vsi_nn_crop_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_cumsum.h b/src/tim/vx/internal/include/ops/vsi_nn_op_cumsum.h new file mode 100644 index 0000000..f016884 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_cumsum.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CUMSUM_H +#define _VSI_NN_OP_CUMSUM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_cumsum_param +{ + int32_t axis; + vsi_bool exclusive; + vsi_bool reverse; +} vsi_nn_cumsum_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h new file mode 100644 index 0000000..043d9e0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
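A minimal sketch (not part of the patch) of how an ovxlib client is expected to instantiate the new CUMSUM operation and fill vsi_nn_cumsum_param. vsi_nn_AddNode() and the VSI_NN_OP_CUMSUM id come from the existing ovxlib graph API and the ops.def entry added above; the nn_param.cumsum union member is added later in this patch in vsi_nn_node_type.h.

    #include "vsi_nn_pub.h"

    /* Hypothetical helper: add a cumulative-sum node along axis 0. */
    static vsi_nn_node_t *add_cumsum_node(vsi_nn_graph_t *graph)
    {
        vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_CUMSUM, 1, 1, NULL);
        if (node)
        {
            node->nn_param.cumsum.axis      = 0;      /* accumulate along the innermost axis */
            node->nn_param.cumsum.exclusive = FALSE;
            node->nn_param.cumsum.reverse   = FALSE;
        }
        return node;
    }
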
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_MAX_POOL3D_H +#define _VSI_NN_OP_MAX_POOL3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_max_pool3d_param +{ + struct _max_pool3d_local_data_t* local; + // Add parameters here + + /* round_type is used to calculate the output shape */ + vsi_nn_round_type_e round_type; + uint32_t ksize[3]; + uint32_t stride[3]; + /* Pad left, right, top, bottom */ + uint32_t pad[6]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; +} vsi_nn_max_pool3d_param; +_compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \ + vsi_nn_max_pool3d_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_mod.h b/src/tim/vx/internal/include/ops/vsi_nn_op_mod.h new file mode 100644 index 0000000..9d8d73c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_mod.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_MOD_H +#define _VSI_NN_OP_MOD_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_mod_param +{ + int32_t fmod; +} vsi_nn_mod_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rcp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rcp.h new file mode 100644 index 0000000..201762b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rcp.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
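Likewise, a minimal sketch (not part of the patch) of filling the new vsi_nn_max_pool3d_param on a node created through the existing ovxlib graph API. The VSI_NN_OP_MAX_POOL3D id comes from the DEF_OP(MAX_POOL3D) entry above; VSI_NN_ROUND_FLOOR and VSI_NN_PAD_AUTO are assumed from ovxlib's existing enums.

    #include "vsi_nn_pub.h"

    /* Hypothetical helper: add a 2x2x2 max pool3d node with no padding. */
    static vsi_nn_node_t *add_max_pool3d_node(vsi_nn_graph_t *graph)
    {
        vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_MAX_POOL3D, 1, 1, NULL);
        if (node)
        {
            uint32_t i;
            for (i = 0; i < 3; i++)
            {
                node->nn_param.max_pool3d.ksize[i]  = 2;
                node->nn_param.max_pool3d.stride[i] = 2;
            }
            for (i = 0; i < 6; i++)
            {
                node->nn_param.max_pool3d.pad[i] = 0;
            }
            node->nn_param.max_pool3d.pad_type   = VSI_NN_PAD_AUTO;
            node->nn_param.max_pool3d.round_type = VSI_NN_ROUND_FLOOR;
        }
        return node;
    }
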
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RCP_H +#define _VSI_NN_OP_RCP_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_rcp_param +{ + struct _rcp_local_data_t* local; + // Add parameters here +} vsi_nn_rcp_param; +_compiler_assert(offsetof(vsi_nn_rcp_param, local) == 0, \ + vsi_nn_rcp_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_sign.h b/src/tim/vx/internal/include/ops/vsi_nn_op_sign.h new file mode 100644 index 0000000..f596802 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_sign.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SIGN_H +#define _VSI_NN_OP_SIGN_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_sign_param +{ + struct _sign_local_data_t* local; + // Add parameters here +} vsi_nn_sign_param; +_compiler_assert(offsetof(vsi_nn_sign_param, local) == 0, \ + vsi_nn_sign_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_softsign.h b/src/tim/vx/internal/include/ops/vsi_nn_op_softsign.h new file mode 100644 index 0000000..97bf611 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_softsign.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SOFTSIGN_H +#define _VSI_NN_OP_SOFTSIGN_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_softsign_param +{ + struct _softsign_local_data_t* local; + // Add parameters here +} vsi_nn_softsign_param; +_compiler_assert(offsetof(vsi_nn_softsign_param, local) == 0, \ + vsi_nn_softsign_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 4586fa8..4e19fc0 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -32,7 +32,7 @@ extern "C" { #endif -static inline vsi_bool type_is_integer +static VSI_INLINE_API vsi_bool type_is_integer ( const vsi_nn_type_e type ) @@ -60,7 +60,7 @@ static inline vsi_bool type_is_integer return ret; } /* type_is_integer() */ -static inline vsi_bool type_is_signed +static VSI_INLINE_API vsi_bool type_is_signed ( const vsi_nn_type_e type ) @@ -86,7 +86,7 @@ static inline vsi_bool type_is_signed return ret; } /* type_is_signed() */ -static inline uint32_t type_get_bytes +static VSI_INLINE_API uint32_t type_get_bytes ( const vsi_nn_type_e type ) @@ -115,7 +115,7 @@ static inline uint32_t type_get_bytes } } /* type_get_bytes() */ -static inline uint32_t type_get_bits +static VSI_INLINE_API uint32_t type_get_bits ( const vsi_nn_type_e type ) @@ -147,7 +147,7 @@ static inline uint32_t type_get_bits } } /* type_get_bits() */ -static inline void type_get_range +static VSI_INLINE_API void type_get_range ( vsi_nn_type_e type, double * max_range, @@ -186,7 +186,24 @@ static inline void type_get_range } } /* type_get_range() */ -static inline int32_t fp32_to_affine +static VSI_INLINE_API vsi_bool fp32_is_inf + ( + float val + ) +{ + uint32_t u_value = *(uint32_t*)&val; + + if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF) + { + return TRUE; + } + else + { + return FALSE; + } +} + +static VSI_INLINE_API int32_t fp32_to_affine ( const float in, const float scale, @@ -200,10 +217,17 @@ static inline int32_t fp32_to_affine type_get_range( type, &max_range, &min_range ); data = (int32_t)(vsi_rint( in / scale ) + zero_point ); data = vsi_nn_max( (int32_t)min_range, vsi_nn_min( (int32_t)max_range , data ) ); + + if (fp32_is_inf(in) != 0) + { + uint32_t sign = (*(uint32_t*)&in) >> 31; + data = sign == 1 ? (int32_t)min_range : (int32_t)max_range; + } + return data; } /* fp32_to_affine() */ -static inline float affine_to_fp32 +static VSI_INLINE_API float affine_to_fp32 ( const int32_t val, const float scale, @@ -216,7 +240,7 @@ static inline float affine_to_fp32 return data; } /* affine_to_fp32() */ -static inline int32_t fp32_to_dfp +static VSI_INLINE_API int32_t fp32_to_dfp ( const float in, const int8_t fl, @@ -237,10 +261,17 @@ static inline int32_t fp32_to_dfp } data = vsi_nn_min( data, (int32_t)max_range ); data = vsi_nn_max( data, (int32_t)min_range ); + + if (fp32_is_inf(in) != 0) + { + uint32_t sign = (*(uint32_t*)&in) >> 31; + data = sign == 1 ? 
(int32_t)min_range : (int32_t) max_range; + } + return data; } /* fp32_to_dfp() */ -static inline float dfp_to_fp32 +static VSI_INLINE_API float dfp_to_fp32 ( const int32_t val, const int8_t fl, @@ -259,7 +290,7 @@ static inline float dfp_to_fp32 return result; } /* dfp_to_fp32() */ -static inline vsi_status integer_convert +static VSI_INLINE_API vsi_status integer_convert ( const void * src, vsi_nn_type_e src_type, @@ -303,7 +334,7 @@ typedef union float f; } _fp32_t; -static inline float fp16_to_fp32 +static VSI_INLINE_API float fp16_to_fp32 ( int16_t in ) @@ -323,7 +354,7 @@ static inline float fp16_to_fp32 return o.f; } /* fp16_to_fp32() */ -static inline float bfp16_to_fp32 +static VSI_INLINE_API float bfp16_to_fp32 ( int16_t in ) @@ -344,7 +375,7 @@ static inline float bfp16_to_fp32 return t3 == 0 ? 0 : out; } /* bfp16_to_fp32() */ -static inline uint16_t fp32_to_fp16 +static VSI_INLINE_API uint16_t fp32_to_fp16 ( float in ) @@ -370,7 +401,7 @@ static inline uint16_t fp32_to_fp16 return (uint16_t) fp16; } /* fp32_to_fp16() */ -static inline uint16_t fp32_to_bfp16 +static VSI_INLINE_API uint16_t fp32_to_bfp16 ( float in ) @@ -381,7 +412,7 @@ static inline uint16_t fp32_to_bfp16 return (uint16_t) t1; } /* fp32_to_bfp16() */ -static inline uint16_t fp32_to_bfp16_rtne +static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne ( float in ) @@ -409,7 +440,7 @@ static inline uint16_t fp32_to_bfp16_rtne return out; } /* fp32_to_bfp16_rtne */ -static inline vsi_status dtype_to_float32 +static VSI_INLINE_API vsi_status dtype_to_float32 ( uint8_t *src, float *dst, @@ -461,7 +492,7 @@ static inline vsi_status dtype_to_float32 return VSI_SUCCESS; } -static inline vsi_status float32_to_dtype +static VSI_INLINE_API vsi_status float32_to_dtype ( float src, uint8_t *dst, diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h b/src/tim/vx/internal/include/utils/vsi_nn_math.h index 18ea5e8..b8a6d2a 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_math.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h @@ -42,6 +42,8 @@ extern "C" { #define vsi_clamp(x, min, max) vsi_nn_clamp(x, min, max) #define vsi_rtne(x) vsi_rint(x) +#define VSI_NN_INT32_MAX (0x7FFFFFFF) + #define VSI_NN_FLOAT32_INF (0x7F800000) #define VSI_NN_FLOAT32_NAN (0x7FC00000) #define VSI_NN_FLOAT64_INF (0x7FF0000000000000) @@ -53,14 +55,14 @@ extern "C" { size_t size; \ TYPE data[0]; \ } vsi_##NAME##_array_t; \ - static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \ + static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \ vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \ sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \ if (array == NULL) return NULL; \ array->size = size; \ return array; \ } \ - static inline void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \ + static VSI_INLINE_API void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \ { \ if( array && *array ) { \ free( *array ); \ @@ -167,7 +169,7 @@ void vsi_nn_random_uniform_transform uint32_t len ); -static inline double copy_sign +static VSI_INLINE_API double copy_sign ( double number, double sign @@ -177,7 +179,7 @@ static inline double copy_sign return (sign > 0) ? 
value : (-value); } /* copy_sign() */ -static inline float simple_round +static VSI_INLINE_API float simple_round ( float x ) @@ -185,7 +187,7 @@ static inline float simple_round return (float) copy_sign(floorf(fabsf(x) + 0.5f), x); } /* simple_round() */ -static inline double vsi_rint +static VSI_INLINE_API double vsi_rint ( double x ) diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 9fb03d9..77b3cb6 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -65,7 +65,7 @@ extern "C" { #define VSI_NN_DO_JOIN(X, Y) VSI_NN_DO_JOIN2(X,Y) #define VSI_NN_DO_JOIN2(X, Y) X##Y -#if defined(_MSC_VER) +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define VSI_NN_DEPRECATED(symbol, hints) \ __declspec(deprecated(VSI_NN_STRINGIZE(hints))) symbol @@ -381,7 +381,7 @@ int32_t vsi_nn_partition * @param[in] num Number of tensors. * @param[out] out_tensors Ordered tensors * */ -static inline void vsi_nn_reorder_tensor +static VSI_INLINE_API void vsi_nn_reorder_tensor ( vsi_nn_tensor_t** tensors, const int32_t* order, @@ -417,6 +417,15 @@ vsi_bool vsi_nn_is_broadcast_operaton vsi_nn_tensor_t * output ); +vsi_bool vsi_nn_is_broadcast_axes_operaton + ( + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t * output, + int32_t * axis, + int32_t axis_num + ); + float vsi_nn_get_tensor_scale ( vsi_nn_tensor_t * tensor diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 20a4dd1..95591ca 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -66,6 +66,8 @@ typedef struct _vsi_nn_hw_config_t uint32_t use_40bits_va; uint32_t support_stream_processor; uint32_t sp_exec_count; + uint32_t sp_vector_depth; + uint32_t sp_per_core_vector_depth; } vsi_nn_hw_config_t; typedef struct _vsi_nn_runtime_option_t diff --git a/src/tim/vx/internal/include/vsi_nn_daemon.h b/src/tim/vx/internal/include/vsi_nn_daemon.h index e005466..4fad88c 100644 --- a/src/tim/vx/internal/include/vsi_nn_daemon.h +++ b/src/tim/vx/internal/include/vsi_nn_daemon.h @@ -35,7 +35,7 @@ struct f##_t_{ ~f##_t_(void) { f(); }}; static f##_t_ f##_; \ static void f(void) -#elif defined(_MSC_VER) +#elif (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #pragma section(".CRT$XCU", read) #define _INITIALIZER2(f, p) \ static void f(void); \ diff --git a/src/tim/vx/internal/include/vsi_nn_feature.h b/src/tim/vx/internal/include/vsi_nn_feature.h index 2ebb367..7297269 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature.h +++ b/src/tim/vx/internal/include/vsi_nn_feature.h @@ -27,7 +27,7 @@ #include "vsi_nn_types.h" #include "vsi_nn_prv.h" -static inline vsi_bool vsi_nn_feature_conv_max_kernel_size() +static VSI_INLINE_API vsi_bool vsi_nn_feature_conv_max_kernel_size() { return 11; } diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h index d3afaa2..d8b5bad 100644 --- a/src/tim/vx/internal/include/vsi_nn_log.h +++ b/src/tim/vx/internal/include/vsi_nn_log.h @@ -31,7 +31,7 @@ extern "C"{ #endif -#ifdef _MSC_VER +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define snprintf(buffer, count, format, ...) 
\ _snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__) #define vsnprintf(buffer, count, format, args) \ diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 16f74fa..5c170df 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -190,6 +190,12 @@ #include "ops/vsi_nn_op_gather_elements.h" #include "ops/vsi_nn_op_selu.h" #include "ops/vsi_nn_op_celu.h" +#include "ops/vsi_nn_op_max_pool3d.h" +#include "ops/vsi_nn_op_rcp.h" +#include "ops/vsi_nn_op_sign.h" +#include "ops/vsi_nn_op_softsign.h" +#include "ops/vsi_nn_op_cumsum.h" +#include "ops/vsi_nn_op_mod.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -365,6 +371,12 @@ typedef union _vsi_nn_nn_param vsi_nn_gather_elements_param gather_elements; vsi_nn_selu_param selu; vsi_nn_celu_param celu; + vsi_nn_max_pool3d_param max_pool3d; + vsi_nn_rcp_param rcp; + vsi_nn_sign_param sign; + vsi_nn_softsign_param softsign; + vsi_nn_cumsum_param cumsum; + vsi_nn_mod_param mod; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 124ac48..5cc2a3e 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -243,6 +243,18 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam uint32_t enable_nodes_count ); +OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph + ( + vsi_nn_graph_t* graph, + uint32_t enabled_crop_input_idx, + uint32_t start_x, + uint32_t start_y, + uint32_t crop_w, + uint32_t crop_h, + uint32_t dst_w, + uint32_t dst_h + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_pub.h b/src/tim/vx/internal/include/vsi_nn_pub.h index 5e9194e..d36f570 100644 --- a/src/tim/vx/internal/include/vsi_nn_pub.h +++ b/src/tim/vx/internal/include/vsi_nn_pub.h @@ -26,7 +26,7 @@ #define _VSI_NN_PUB_H #if !defined(OVXLIB_API) - #if defined(_WIN32) + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define OVXLIB_API __declspec(dllimport) #else #define OVXLIB_API __attribute__((visibility("default"))) diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 8aa3ca9..076f493 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -33,11 +33,13 @@ extern "C"{ #endif -#ifdef _WIN32 -#define inline __inline +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) +#define VSI_INLINE_API __inline +#else +#define VSI_INLINE_API inline #endif -#if (defined(_MSC_VER) || defined(__MINGW32)) +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define SIZE_T_SPECIFIER "Iu" #define SSIZE_T_SPECIFIER "Id" #ifdef VSI_40BIT_VA_SUPPORT @@ -59,7 +61,7 @@ extern "C"{ #endif #endif -#if defined(_MSC_VER) +#if (defined(_MSC_VER)) #include typedef SSIZE_T ssize_t; #else diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index faab685..711c498 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 43 +#define VSI_NN_VERSION_PATCH 50 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 
100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index bf5b07c..5741690 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -188,7 +188,7 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (input_dtype == I8) + if (input_dtype == I8 || input_dtype == I16) { input_dtype = I32; } @@ -269,7 +269,6 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); CHECK_STATUS_FAIL_GOTO( status, OnError ); - } } @@ -285,4 +284,3 @@ OnError: __END_DECLS REGISTER_BACKEND_CL( argmax, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c index 2911a84..b710fa1 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -188,6 +188,11 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input_dtype == I8 || input_dtype == I16) + { + input_dtype = I32; + } + if (output_dtype == I16) { output_dtype = I32; @@ -264,7 +269,6 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); CHECK_STATUS_FAIL_GOTO( status, OnError ); - } } diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c new file mode 100644 index 0000000..91746ab --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -0,0 +1,365 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define KERNEL_SOURCE_1 "cumsum" +#define KERNEL_SOURCE_2 "cumsum_2d" + +// Add kernel hashtable here +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + +#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + KERNEL_SOURCE_1 }, + +#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + KERNEL_SOURCE_2 }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } cumsum_map[] = +{ + HASH_CUMSUM_KERNELS(0, U8, U8) + HASH_CUMSUM_KERNELS(0, F32, F32) + HASH_CUMSUM_KERNELS(1, U8, U8) + HASH_CUMSUM_KERNELS(1, F32, F32) + HASH_CUMSUM_KERNELS(2, U8, U8) + HASH_CUMSUM_KERNELS(2, F32, F32) + HASH_CUMSUM_KERNELS_2D(0, U8, U8) + HASH_CUMSUM_KERNELS_2D(0, F32, F32) + HASH_CUMSUM_KERNELS_2D(1, U8, U8) + HASH_CUMSUM_KERNELS_2D(1, F32, F32) +}; + +/* + * Kernel params + */ +static vx_param_description_t _cumsum_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_cumsum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * input_shape = NULL; + int32_t axis = 0; + int32_t width = 0; + int32_t height = 0; + int32_t channel = 0; + int32_t w = 1; + int32_t h = 1; + int32_t c = 1; + uint32_t dim = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + dim = (uint32_t)input_shape->size; + width = (int32_t)(input_shape->data[0]); + height = 
(int32_t)(input_shape->data[1]); + channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1); + + if (axis == 0) + { + w = 1; + h = height; + c = channel; + } + else if (axis == 1) + { + w = width; + h = 1; + c = channel; + } + else if (axis == 2) + { + w = width; + h = height; + c = 1; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = w; + gpu_param.global_size[1] = h; + gpu_param.global_size[2] = c; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _cumsum_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + int32_t is_2d + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == U32) + { + input0_dtype = U8; + } + + if (input0_dtype == F16) + { + input0_dtype = F32; + } + + if (output_dtype == U32) + { + output_dtype = U8; + } + + if (output_dtype == F16) + { + output_dtype = F32; + } + + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + + for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) + { + if ( cumsum_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(cumsum_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name ); + kernel->info.parameters = _cumsum_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def ); + kernel->info.initialize = _cumsum_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + cumsum_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + cumsum_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t axis_new = 0; + int32_t is_2d = 0; + uint32_t rs_dim = 2; + int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float in_out_scale = input_scale * output_scale; + float in_out_zp_scale = in_out_scale * input_zp; + int32_t width = 0; + int32_t height = 0; + int32_t channel = 
1; + int32_t i = 0; + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + if (rs_dim > 3) + { + return NULL; + } + + width = (int32_t)shapes[0][0]; + height = (int32_t)shapes[0][1]; + + if (rs_dim == 2) + { + is_2d = 1; + } + else + { + channel = (int32_t)shapes[0][2]; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], (vsi_size_t)rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], (vsi_size_t)rs_dim ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _CUMSUM_PARAM_NUM, + reshape_tensors, 1, &reshape_tensors[1], 1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_zp_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUMSUM_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + } + } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( cumsum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 7bf6d36..7e1d681 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -53,6 +53,9 @@ typedef enum UNARY_HGELU, UNARY_SELU, UNARY_CELU, + UNARY_RCP, + UNARY_SIGN, + UNARY_SOFTSIGN, } unary_type_e; /* @@ -94,6 +97,13 @@ typedef enum #define HGELU_OPERATION hard_gelu #define SELU_OPERATION selu #define CELU_OPERATION celu +#define RCP_OPERATION rcp +#define SIGN_OPERATION sign +#define SOFTSIGN_OPERATION softsign + +#define ADD_UNARY_SH_KERNELS(name, src_type, dst_type) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, src_type, dst_type) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, src_type, dst_type) static const struct { uint32_t key; @@ -101,61 +111,39 @@ static const struct { const char* source_name; } kernel_map[] = { - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F32, F32) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F32, F32) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F32, F32) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F32, F32) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F32, F32) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F32, F32) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F32, F32) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F32, F32) - TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, F32, F32) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F32, F32) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F32, F32) + ADD_UNARY_SH_KERNELS(SIN, F32, F32) + ADD_UNARY_SH_KERNELS(COS, F32, F32) + ADD_UNARY_SH_KERNELS(EXP, F32, F32) + ADD_UNARY_SH_KERNELS(LOG, F32, F32) + ADD_UNARY_SH_KERNELS(NEG, F32, F32) + ADD_UNARY_SH_KERNELS(HSIGMOID, F32, F32) + ADD_UNARY_SH_KERNELS(MISH, F32, F32) + ADD_UNARY_SH_KERNELS(ROUND, F32, F32) + ADD_UNARY_SH_KERNELS(GELU, F32, F32) + ADD_UNARY_SH_KERNELS(HGELU, F32, F32) + ADD_UNARY_SH_KERNELS(SELU, F32, F32) + ADD_UNARY_SH_KERNELS(CELU, F32, F32) + ADD_UNARY_SH_KERNELS(RCP, F32, F32) + ADD_UNARY_SH_KERNELS(SIGN, F32, F32) + ADD_UNARY_SH_KERNELS(SOFTSIGN, F32, F32) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F32, F32) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F32, F32) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F32, F32) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F32, F32) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F32, F32) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F32, F32) - 
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F32, F32) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F32, F32) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F32, F32) + ADD_UNARY_SH_KERNELS(SIN, U8, U8) + ADD_UNARY_SH_KERNELS(COS, U8, U8) + ADD_UNARY_SH_KERNELS(EXP, U8, U8) + ADD_UNARY_SH_KERNELS(LOG, U8, U8) + ADD_UNARY_SH_KERNELS(NEG, U8, U8) + ADD_UNARY_SH_KERNELS(HSIGMOID, U8, U8) + ADD_UNARY_SH_KERNELS(MISH, U8, U8) + ADD_UNARY_SH_KERNELS(ROUND, U8, U8) + ADD_UNARY_SH_KERNELS(GELU, U8, U8) + ADD_UNARY_SH_KERNELS(HGELU, U8, U8) + ADD_UNARY_SH_KERNELS(SELU, U8, U8) + ADD_UNARY_SH_KERNELS(CELU, U8, U8) + ADD_UNARY_SH_KERNELS(RCP, U8, U8) + ADD_UNARY_SH_KERNELS(SIGN, U8, U8) + ADD_UNARY_SH_KERNELS(SOFTSIGN, U8, U8) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8) - TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, U8, U8) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8) - - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8) - - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32) - - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32) + ADD_UNARY_SH_KERNELS(NEG, I32, I32) }; #undef SIN_OPERATION @@ -170,6 +158,9 @@ static const struct { #undef HGELU_OPERATION #undef SELU_OPERATION #undef CELU_OPERATION +#undef RCP_OPERATION +#undef SIGN_OPERATION +#undef SOFTSIGN_OPERATION /* * Kernel params */ @@ -458,4 +449,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( sign, UNARY_SIGN ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( softsign, UNARY_SOFTSIGN ) + __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index fdeda2e..f04c62f 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -123,7 +123,7 @@ static vsi_status cal_gather_tensor_reshape_size uint32_t i = 0; vsi_size_t elementCnt = 1; 
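    /* Presumably this helper collapses the gather shapes into a 2D/3D image so the
       CL kernel can be dispatched; each flattened extent is kept under the driver's
       GPU_TENSOR_MAX_WIDTH limit, which now replaces the old hard-coded 65536. */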
vsi_size_t outerCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH for (i = 0; i < dims_num - batch_dims; ++i) { @@ -365,4 +365,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( gather, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index 4612e4f..74dd993 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -111,7 +111,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) @@ -336,4 +336,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( gather_nd, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 58eb2ee..892377b 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -44,21 +43,20 @@ __BEGIN_DECLS */ typedef enum { - INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_SUMS, INTERNAL_KERNEL_NORM, } _internal_kernel_e; #define KERNEL_SOURCE_1 "instance_normalization_u8" -#define KERNEL_SOURCE_2 "instance_normalization_f16" +#define KERNEL_SOURCE_2 "instance_normalization_f32" #define KERNEL_SOURCE_3 "instance_normalization_i32" -#define KERNEL_SOURCE_4 "instance_normalization_f32" // Add kernel hashtable here -#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE) +#define HASH_INSTANCENORM_SUMS_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE) -#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE"_2D") +#define HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE"_2D") #define HASH_INSTANCENORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("cl.instance_norm_"#SRC0_TYPE"to"#DST_TYPE) @@ -68,17 +66,17 @@ typedef enum // Add kernel hashtable here // mean vari -#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ +#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \ - HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_SUMS_KERNEL_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \ - HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, // normalization @@ -102,17 +100,15 @@ 
typedef struct const char * source_name; } _kernel_map_type; -static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = +static const _kernel_map_type _instancenorm_sums_kernel_map[] = { // Register kernel here - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I32, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F32, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SUMS_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) }; static const _kernel_map_type _instancenorm_kernel_map[] = @@ -123,22 +119,19 @@ static const _kernel_map_type _instancenorm_kernel_map[] = TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_1 ) TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) TENSOR_INSTANCENORM_KERNELS( I32, I32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS( I32, F32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) - - TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 ) }; /* * Kernel params */ -static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = +static vx_param_description_t _instancenorm_sums_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -146,12 +139,9 @@ static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def ) +#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def ) static vx_param_description_t _instancenorm_kernel_param_def[] = { @@ -168,10 +158,6 @@ static vx_param_description_t _instancenorm_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // 
Add kererl parameters here }; #define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def ) @@ -179,7 +165,7 @@ static vx_param_description_t _instancenorm_kernel_param_def[] = /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) +DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -244,7 +230,7 @@ final: attr[1] = NULL; } return status; -} /* _instance_normalization_mean_vari_initializer() */ +} /* _instance_normalization_sums_initializer() */ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) ( @@ -334,12 +320,12 @@ static vsi_status _query_kernel switch( kernel_id ) { - case INTERNAL_KERNEL_MEAN_VARI: - initializer = _instancenorm_mean_vari_initializer; - kernel_map = _instancenorm_mean_vari_kernel_map; - kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map ); - param_def = _instancenorm_mean_vari_kernel_param_def; - param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM; + case INTERNAL_KERNEL_SUMS: + initializer = _instancenorm_sums_initializer; + kernel_map = _instancenorm_sums_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map ); + param_def = _instancenorm_sums_kernel_param_def; + param_size = _INSTANCENORM_SUMS_PARAM_NUM; break; case INTERNAL_KERNEL_NORM: initializer = _instancenorm_initializer; @@ -392,9 +378,9 @@ static vsi_nn_kernel_node_t _setup ) { #define INTERNAL_KERNEL_SIZE (1) -#define MEAN_VARI_INDEX (0) +#define SUMS_INDEX (0) vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_dtype_e in0_dtype = U8; @@ -407,18 +393,17 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkey = 0; int32_t i = 0; uint32_t rank = outputs[0]->attr.dim_num; - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) / + (input_scale * input_scale); size_t width = inputs[0]->attr.size[0]; size_t height = inputs[0]->attr.size[1]; int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH && rank > 2; int32_t group_num = (int32_t)(width + 15) / 16; - int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); - float input_scale = vsi_nn_get_tensor_scale(inputs[0]); int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]); float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); - float in_fl_scale = 1.0f, out_fl_scale = 1.0; - float dim_ratio = (float)1.0 / (float)(width * height); + float inv_multiplier = (float)1.0 / (float)(width * height); if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -443,15 +428,21 @@ static vsi_nn_kernel_node_t _setup attr.size[2] = 1; attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; attr.dim_num = 4; - tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr ); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + in0_dtype = in0_dtype == F16 ? F32 : in0_dtype; + in0_dtype = in0_dtype == I8 ? 
I32 : in0_dtype; + in0_dtype = in0_dtype == I16 ? I32 : in0_dtype; + out_dtype = out_dtype == F16 ? F32 : out_dtype; + out_dtype = out_dtype == I8 ? I32 : out_dtype; + out_dtype = out_dtype == I16 ? I32 : out_dtype; - hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); + hashkeys[SUMS_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg ); hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + status = _query_kernel( ikernels[SUMS_INDEX], hashkeys[SUMS_INDEX], INTERNAL_KERNEL_SUMS ); if ( VSI_SUCCESS != status ) { goto final; @@ -497,37 +488,31 @@ static vsi_nn_kernel_node_t _setup } // Mean Vari { - node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] ); if (node) { uint32_t index = 0; if (reshape_flg) { - mean_vari_node_params[index++] = rs_input; + sums_node_params[index++] = rs_input; } else { - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + sums_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; } - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + sums_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); - status = vsi_nn_kernel_node_pass_param( node, mean_vari_node_params, - _INSTANCENORM_MEAN_VARI_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( node, sums_node_params, + _INSTANCENORM_SUMS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[5] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[6] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[7] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[8] ); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); + vsi_nn_kernel_scalar_release( &sums_node_params[4] ); + vsi_nn_kernel_scalar_release( &sums_node_params[5] ); vsi_nn_kernel_node_release( &node ); } } @@ -562,7 +547,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t; if 
(reshape_flg) { node_params[index++] = rs_output; @@ -573,15 +558,11 @@ static vsi_nn_kernel_node_t _setup } node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &out_fl_scale ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_num ); status = vsi_nn_kernel_node_pass_param( node, node_params, @@ -595,10 +576,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[10] ); vsi_nn_kernel_scalar_release( &node_params[11] ); vsi_nn_kernel_scalar_release( &node_params[12] ); - vsi_nn_kernel_scalar_release( &node_params[13] ); - vsi_nn_kernel_scalar_release( &node_params[14] ); - vsi_nn_kernel_scalar_release( &node_params[15] ); - vsi_nn_kernel_scalar_release( &node_params[16] ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c new file mode 100644 index 0000000..2311810 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c @@ -0,0 +1,312 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + +#define KERNEL_SOURCE_1 "maxpoolwithargmax" +#define KERNEL_SOURCE_2 "maxpoolwithargmax_2d" + +// Add kernel hashtable here +#define MAXPOOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, _image_2d) \ + (( IN_DTYPE << 24 ) | ( OUT_DTYPE0 << 20) | ( OUT_DTYPE1 << 12) | (_image_2d)) + +#define HASH_MAXPOOLWITHARGMAX_KERNELS( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \ + { MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 0), \ + CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1), \ + KERNEL_SOURCE_1 }, + +#define HASH_MAXPOOLWITHARGMAX_KERNELS_2D( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \ + { MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 1), \ + CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1"_2D"), \ + KERNEL_SOURCE_2 }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } maxpoolwithargmax_map[] = +{ + HASH_MAXPOOLWITHARGMAX_KERNELS(F32, F32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS(BF16, BF16, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS(U32, U32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS(I32, I32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(F32, F32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(BF16, BF16, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(U32, U32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(I32, I32, I32) +}; + +/* + * Kernel params + */ +static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_status status = VX_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t * attr_out = NULL; + vsi_size_array_t * out_shape = NULL; + + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = attr_out->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = out_shape->data[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (attr_out) + { + vsi_nn_kernel_tensor_attr_release(&attr_out); + } + + return status; +} /* _maxpoolwithargmax_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t is_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input_dtype = U8; + vsi_nn_kernel_dtype_e output0_dtype = U8; + vsi_nn_kernel_dtype_e output1_dtype = I32; + uint32_t key = 0; + int32_t i = 0; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + output1_dtype = vsi_nn_kernel_map_dtype( outputs[1]->attr.dtype.vx_type ); + + if (input_dtype == U8) + { + input_dtype = U32; + } + + if (input_dtype == I8 || input_dtype == I16) + { + input_dtype = I32; + } + + if (input_dtype == F16) + { + input_dtype = F32; + } + + if (output0_dtype == U8) + { + output0_dtype = U32; + } + + if (output0_dtype == I8 || output0_dtype == I16) + { + output0_dtype = I32; + } + + if (output0_dtype == F16) + { + output0_dtype = F32; + } + + key = MAXPOOLWITHARGMAX_HASH_KEY( input_dtype, output0_dtype, output1_dtype, is_2d); + + for ( i = 0; i < _cnt_of_array(maxpoolwithargmax_map); i ++ ) + { + if ( maxpoolwithargmax_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(maxpoolwithargmax_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", maxpoolwithargmax_map[i].function_name ); + kernel->info.parameters = _maxpoolwithargmax_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _maxpoolwithargmax_kernel_param_def ); + kernel->info.initialize = _maxpoolwithargmax_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + maxpoolwithargmax_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + maxpoolwithargmax_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_x = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_y = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t image_2d = inputs[0]->attr.dim_num == 2 ? 
1 : 0; + int32_t width = (int32_t)inputs[0]->attr.size[0]; + int32_t height = (int32_t)inputs[0]->attr.size[1]; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float scale_value = 1.0f; + float tail_value = 0.0f; + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[1]->attr.size, + outputs[1]->attr.dim_num )) + { + return NULL; + } + + scale_value = inputScale / outputScale; + tail_value = outputTail - inputTail * inputScale / outputScale; + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 3; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_value ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &tail_value ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( maxpoolwithargmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/mod_cl.c b/src/tim/vx/internal/src/kernel/cl/mod_cl.c new file mode 100644 index 0000000..1398823 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/mod_cl.c @@ -0,0 +1,303 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define MOD_KERNEL_SOURCE_NAME "mod" + +#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + + +#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \ + MOD_KERNEL_SOURCE_NAME}, + +#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + MOD_KERNEL_SOURCE_NAME }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _mod_kernel_map[] = +{ + +// Register kernel here + MOD_KERNELS( F32, F32, F32 ) + MOD_KERNELS( I32, I32, I32 ) + MOD_KERNELS( I32, I32, U8 ) + MOD_KERNELS( U8, U8, U8 ) + MOD_KERNELS( U8, I32, U8 ) + + MOD_KERNELS_2D( F32, F32, F32 ) + MOD_KERNELS_2D( I32, I32, I32 ) + MOD_KERNELS_2D( I32, I32, U8 ) + MOD_KERNELS_2D( U8, U8, U8 ) + MOD_KERNELS_2D( U8, I32, U8 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _mod_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) +#define MOD_QUANT_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_mod_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? 
output_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _mod_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _mod_kernel_map; + size_t kernel_map_size = _cnt_of_array( _mod_kernel_map ); + vx_param_description_t * param_def = _mod_kernel_param_def; + size_t param_def_size = _cnt_of_array( _mod_kernel_param_def ); + vx_kernel_initialize_f initializer = _mod_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + else if (I16 == in0_dtype || I8 == in0_dtype) + { + in0_dtype = I32; + } + + if (F16 == in1_dtype) + { + in1_dtype = F32; + } + else if (I16 == in1_dtype || I8 == in1_dtype) + { + in1_dtype = I32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I16 == out_dtype || I8 == out_dtype) + { + out_dtype = I32; + } + + key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod"); + + outputScale = 1.0f / outputScale; + input0Tail = -(input0Tail * input0Scale); + input1Tail = -(input1Tail * input1Scale); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + status = _query_kernel( kernel, inputs, outputs, image_2d); + 
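+    /* Assuming the CL kernel applies a plain affine dequantize/requantize, the
+     * scalars prepared above read as: x_f = x_q * inputScale + inputTail, where
+     * each inputTail already holds -(zero_point * scale); mod/fmod then runs in
+     * float, and the result is stored as out_q = out_f * outputScale + outputTail,
+     * with outputScale pre-inverted so the kernel only needs a multiply. */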
if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = MOD_QUANT_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[5] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params[7] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[9] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( mod, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index bed0f91..c36851e 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -48,7 +48,7 @@ __BEGIN_DECLS #define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \ { ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \ - CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \ + CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ _ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) } typedef struct @@ -61,6 +61,7 @@ typedef struct static const _kernel_map_type _roi_align_kernel_map[] = { PACK_KERNEL_MAP(F32, F32, I32, F32), + PACK_KERNEL_MAP(U8, U16, I32, U8), }; @@ -82,20 +83,28 @@ static vx_param_description_t _roi_align_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) -#define SCALAR_SPATIAL_X_SCALE (4) -#define SCALAR_SPATIAL_Y_SCALE (5) -#define SCALAR_INPUT_WIDTH (6) -#define SCALAR_INPUT_HEIGHT (7) -#define SCALAR_RCP_OF_OUTPUT_WIDTH (8) -#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9) -#define SCALAR_SAMPLING_X_RATIO (10) -#define SCALAR_SAMPLING_Y_RATIO (11) -#define SCALAR_DEPTH (12) +#define SCALAR_INPUT_SCALE (4) +#define SCALAR_INPUT_TAIL (5) +#define SCALAR_OUTPUT_SCALE (6) +#define SCALAR_OUTPUT_ZP (7) +#define SCALAR_SPATIAL_X_SCALE (8) +#define SCALAR_SPATIAL_Y_SCALE (9) +#define SCALAR_INPUT_WIDTH (10) +#define SCALAR_INPUT_HEIGHT (11) +#define SCALAR_RCP_OF_OUTPUT_WIDTH (12) +#define SCALAR_RCP_OF_OUTPUT_HEIGHT (13) +#define SCALAR_SAMPLING_X_RATIO (14) +#define SCALAR_SAMPLING_Y_RATIO (15) +#define 
SCALAR_DEPTH (16) -#define ROI_ALIGN_PARAM_NUM 13 +#define ROI_ALIGN_PARAM_NUM 17 #define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) /* @@ -185,6 +194,7 @@ static vsi_status _query_kernel in0_dtype = in0_dtype == F16 ? F32 : in0_dtype; in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; + out_dtype = out_dtype == F16 ? F32 : out_dtype; key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d ); @@ -241,8 +251,14 @@ static vsi_nn_kernel_node_t _setup float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); - float width_scale = 1.0f / width_ratio; - float height_scale = 1.0f / height_ratio; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(input_zp * input_scale); + float roi_scale = vsi_nn_get_tensor_scale(inputs[1]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float width_scale = roi_scale / width_ratio; + float height_scale = roi_scale / height_ratio; float in_width = (float)(inputs[0]->attr.size[0]); float in_height = (float)(inputs[0]->attr.size[1]); float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]); @@ -287,6 +303,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM, reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale ); node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale ); node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width ); @@ -299,6 +319,10 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. 
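The new `width_scale`/`height_scale` above fold the ROI tensor's quantization scale into the spatial scale (`width_scale = roi_scale / width_ratio`), so a raw quantized ROI coordinate can be mapped onto the feature map with a single multiply. A small worked example with made-up numbers:

```c
#include <stdio.h>

int main(void)
{
    /* Illustrative values only: a U16 ROI tensor quantized with scale 1/8,
     * and a width_ratio of 4 between ROI coordinates and the feature map. */
    float roi_scale   = 0.125f;
    float width_ratio = 4.0f;
    float width_scale = roi_scale / width_ratio;   /* what _setup() now passes down */

    unsigned short qx1 = 160;                      /* raw quantized ROI x1 */
    printf("x1 on the feature map: %f\n", qx1 * width_scale);  /* (160 * 0.125) / 4 = 5 */
    return 0;
}
```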
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c index 1eba1c2..5ec59b1 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c @@ -115,7 +115,7 @@ static vsi_status cal_scatter_nd_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) @@ -333,4 +333,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( scatter_nd, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c index 2ab4a16..fd72a9d 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c @@ -108,7 +108,7 @@ static vsi_status cal_scatter_nd_update_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) @@ -373,4 +373,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( scatter_nd_update, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index ad99bc6..a3d5428 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -49,6 +48,13 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ _TOPK_KERNEL_SOURCE } +#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) +#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + "topk_odd_even_sort" } + typedef struct { uint32_t key; @@ -84,6 +90,14 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( I32, I32, 6 ), }; +static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = +{ + // Register kernel here + PACK_ODD_EVEN_SORT_KERNEL_MAP( F32, F32 ), + PACK_ODD_EVEN_SORT_KERNEL_MAP( U32, U32 ), + PACK_ODD_EVEN_SORT_KERNEL_MAP( I32, I32 ), +}; + /* * Kernel params */ @@ -99,6 +113,19 @@ static vx_param_description_t _topk_kernel_param_def[] = #define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) #define SCALAR_INPUT_NUM_STAGES (3) #define SCALAR_INPUT_WIDTH (4) + +static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + 
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _TOPK_ODD_EVEN_SORT_PARAM_NUM _cnt_of_array( _topk_odd_even_sort_kernel_param_def ) +#define SCALAR_INPUT_SIZE (5) /* * Kernel initializer */ @@ -140,9 +167,47 @@ DEF_KERNEL_INITIALIZER(_topk_initializer) final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } SAFE_FREE_TENSOR_ATTR(input_attr); +#undef SAFE_FREE_TENSOR_ATTR return status; } /* _topk_initializer() */ +DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * in_shape = NULL; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 32; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = 32; + gpu_param.global_size[1] = in_shape->data[1]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); +#undef SAFE_FREE_TENSOR_ATTR + return status; +} /* _topk_odd_even_sort_initializer() */ /* * Query kernel @@ -215,6 +280,72 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ +static vsi_status _query_odd_even_sort_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _topk_odd_even_sort_kernel_map; + size_t kernel_map_size = _cnt_of_array( _topk_odd_even_sort_kernel_map ); + vx_param_description_t * param_def = _topk_odd_even_sort_kernel_param_def; + vx_kernel_initialize_f initializer = _topk_odd_even_sort_initializer; +#define _PACK_SELECT_KEY( in_type, out_type ) \ + ( (in_type) | (out_type << 8) ) + uint32_t key = 0; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = TOPK_ODD_EVEN_SORT_HASH_KEY( F32, F32 ); + break; + case _PACK_SELECT_KEY(U32, U32): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U8, U8): + key = TOPK_ODD_EVEN_SORT_HASH_KEY( U32, U32 ); + break; + case _PACK_SELECT_KEY(I32, I32): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I8, I8): + key = TOPK_ODD_EVEN_SORT_HASH_KEY( I32, I32 ); + break; + default: + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _topk_odd_even_sort_kernel_param_def ); + 
kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ static vsi_nn_kernel_node_t _setup ( @@ -228,16 +359,19 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM]; + vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM]; vsi_nn_kernel_node_t node = NULL; vsi_size_t block_size = inputs[0]->attr.size[0]; vsi_size_t block_num = 1; uint32_t i = 0; - vsi_nn_tensor_t* rs_tensors[3] = { NULL }; + vsi_nn_tensor_t* rs_tensors[5] = { NULL }; + vsi_nn_tensor_attr_t attr; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t width = (int32_t)block_size; int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f)); + vsi_bool is_odd_even_sort = FALSE; + size_t param_num = _TOPK_PARAM_NUM; for (i = 1; i < inputs[0]->attr.dim_num; i ++) { @@ -257,26 +391,58 @@ static vsi_nn_kernel_node_t _setup rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], shape[0], 2 ); - rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], shape[1], 2 ); - rs_tensors[2] = vsi_nn_reshape_tensor( graph, - outputs[1], shape[1], 2 ); - status = _query_kernel( kernel, inputs, outputs, num_stages ); + if (num_stages < 7) + { + status = _query_kernel( kernel, inputs, outputs, num_stages ); + + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], 2 ); + rs_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[1], shape[1], 2 ); + } + else + { + status = _query_odd_even_sort_kernel( kernel, inputs, outputs ); + is_odd_even_sort = TRUE; + param_num = _TOPK_ODD_EVEN_SORT_PARAM_NUM; + + memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); + rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + + rs_tensors[3] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], 2 ); + rs_tensors[4] = vsi_nn_reshape_tensor( graph, + outputs[1], shape[1], 2 ); + + input_num = 3; + } if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM, - rs_tensors, input_num, &rs_tensors[1], output_num ); + vsi_nn_kernel_node_pack_io( node_params, param_num, + rs_tensors, input_num, &rs_tensors[input_num], output_num ); /* Pass parameters to node. 
*/ - node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create( - graph, I32, &num_stages ); - node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( - graph, I32, &width ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM ); + if (is_odd_even_sort) + { + node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &width ); + } + else + { + node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create( + graph, I32, &num_stages ); + node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( + graph, I32, &width ); + } + + status = vsi_nn_kernel_node_pass_param( node, node_params, param_num ); CHECK_STATUS_FAIL_GOTO( status, final ); } } @@ -284,13 +450,25 @@ final: vsi_safe_release_tensor(rs_tensors[0]); vsi_safe_release_tensor(rs_tensors[1]); vsi_safe_release_tensor(rs_tensors[2]); - if (node_params[SCALAR_INPUT_NUM_STAGES]) + vsi_safe_release_tensor(rs_tensors[3]); + vsi_safe_release_tensor(rs_tensors[4]); + if (is_odd_even_sort) { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] ); + if (node_params[SCALAR_INPUT_SIZE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SIZE] ); + } } - if (node_params[SCALAR_INPUT_WIDTH]) + else { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + if (node_params[SCALAR_INPUT_NUM_STAGES]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] ); + } + if (node_params[SCALAR_INPUT_WIDTH]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + } } return node; diff --git a/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c new file mode 100644 index 0000000..29f333d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c @@ -0,0 +1,260 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
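The topk branch above switches away from the staged bitonic kernel once `num_stages` reaches 7 (block sizes above 128 elements) and uses the dedicated odd-even sort kernel instead, creating intermediate value/index tensors on the fly. For reference, odd-even transposition sort is the simple pairwise scheme sketched below (plain C illustration in descending order, not the OpenCL kernel):

```c
#include <stddef.h>

static void odd_even_sort_desc(float *val, int *idx, size_t n)
{
    for (size_t pass = 0; pass < n; pass++)     /* n passes guarantee a sorted result */
    {
        size_t start = pass & 1;                /* alternate between odd and even pairs */
        for (size_t i = start; i + 1 < n; i += 2)
        {
            if (val[i] < val[i + 1])
            {
                float tv = val[i];  val[i] = val[i + 1];  val[i + 1] = tv;
                int   ti = idx[i];  idx[i] = idx[i + 1];  idx[i + 1] = ti;
            }
        }
    }
    /* After sorting, the first top_k entries of val/idx are the TopK result. */
}
```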
+ */ +#define _CPU_ARG_NUM (3) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cumsum") + +DEF_KERNEL_EXECUTOR(_cumsum_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[2] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t axisSize = 1, innerSize = 1, outerSize = 1; + int32_t axis = 0, exclusive = 0, reverse = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + int32_t dims_num = (int32_t)attr[1]->shape->size; + int32_t inner = 0; + int32_t outer = 0; + + for(i = 0; i < axis; ++i) + { + innerSize *= (int32_t)attr[0]->shape->data[i]; + } + + axisSize = (int32_t)attr[0]->shape->data[i++]; + + for(; i < dims_num; ++i) + { + outerSize *= (int32_t)attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + float sum = .0f; + + if (exclusive && reverse) + { + int32_t idx_out = (outer * axisSize + axisSize - 1) * innerSize + inner; + buffer[1][idx_out] = sum; + for (i = axisSize - 1; i > 0; i--) + { + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + idx_out = (outer * axisSize + i - 1) * innerSize + inner; + sum += value; + buffer[1][idx_out] = sum; + } + } + else if (exclusive) + { + int32_t idx_out = outer * axisSize * innerSize + inner; + buffer[1][idx_out] = sum; + for (i = 0; i < axisSize - 1; ++i) + { + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + idx_out = (outer * axisSize + i + 1) * innerSize + inner; + sum += value; + buffer[1][idx_out] = sum; + } + } + else if (reverse) + { + for (i = axisSize - 1; i >= 0; i--) + { + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + sum += value; + buffer[1][idx] = sum; + } + } + else + { + for (i = 0; i < axisSize; ++i) + { + // i * innerSize + inner + outer * innerSize * axisSize + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + sum += value; + buffer[1][idx] = sum; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + 
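The exclusive/reverse branches in the cumsum executor above are easiest to sanity-check on a single 1-D slice. A compact reference with hand-worked expectations (standalone sketch, not the kernel's indexing):

```c
/* Hand-worked expectations for {1, 2, 3, 4}:
 *   plain               {1, 3, 6, 10}
 *   reverse             {10, 9, 7, 4}
 *   exclusive           {0, 1, 3, 6}
 *   exclusive + reverse {9, 7, 4, 0}
 */
static void cumsum_1d(const float *in, float *out, int n, int exclusive, int reverse)
{
    float sum = 0.0f;
    int step  = reverse ? -1 : 1;
    int i     = reverse ? n - 1 : 0;
    int k;

    for (k = 0; k < n; k++, i += step)
    {
        if (exclusive) { out[i] = sum; sum += in[i]; }
        else           { sum += in[i]; out[i] = sum; }
    }
}
```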
buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < 2; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _cumsum_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _cumsum_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def ) + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _cumsum_exec; + kernel->info.parameters = _cumsum_kernel_param_def; + kernel->info.numParams = _CUMSUM_PARAM_NUM; + + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( cumsum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index 7c6c480..061d5bc 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -50,6 +50,9 @@ typedef enum UNARY_HGELU, UNARY_SELU, UNARY_CELU, + UNARY_RCP, + UNARY_SIGN, + UNARY_SOFTSIGN, } unary_type_e; @@ -145,6 +148,21 @@ static float celu_eval(float x, float alpha) return positive + negative; } +static float rcp_eval(float x) +{ + return 1 / x; +} + +static float sign_eval(float x) +{ + return x > 0 ? 1.0f : x < 0 ? 
-1.0f : 0; +} + +static float softsign_eval(float x) +{ + return x / (1.0f + vsi_abs(x)); +} + DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) ( vsi_nn_kernel_node_t node, @@ -227,6 +245,15 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) case UNARY_CELU: data = celu_eval(data, alpha); break; + case UNARY_RCP: + data = rcp_eval(data); + break; + case UNARY_SIGN: + data = sign_eval(data); + break; + case UNARY_SOFTSIGN: + data = softsign_eval(data); + break; default: break; } @@ -360,4 +387,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU ) \ No newline at end of file +REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( rcp, UNARY_RCP ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( sign, UNARY_SIGN ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( softsign, UNARY_SOFTSIGN ) \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c new file mode 100644 index 0000000..900451a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c @@ -0,0 +1,284 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
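A few hand-worked values for the three unary references added above, useful as quick expectations in a unit test:

```c
/* Hand-worked expectations:
 *   rcp_eval(2.0f)      == 0.5f
 *   sign_eval(-3.2f)    == -1.0f,  sign_eval(0.0f) == 0.0f,  sign_eval(7.0f) == 1.0f
 *   softsign_eval(3.0f) == 3.0f / (1.0f + 3.0f) == 0.75f   (softsign stays inside (-1, 1))
 */
```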
+ */ +#define _CPU_ARG_NUM (8) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (2) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.maxpoolwithargmax") + +#define FP32_MIN -3.4e38 + +/* + * Kernel params + */ +static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} + // Add kererl parameters here +}; +#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def ) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_maxpoolwithargmax_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0; + int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0; + int32_t i = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &ksize_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &stride_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &stride_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_left); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_right); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_top); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_bottom); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + 
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + { + int32_t dims_num = (int32_t)attr[1]->shape->size; + int32_t batch = dims_num > 3 ? (int32_t)attr[1]->shape->data[3] : 1; + int32_t depth = dims_num > 2 ? (int32_t)attr[1]->shape->data[2] : 1; + int32_t height_o = (int32_t)attr[1]->shape->data[1]; + int32_t width_o = (int32_t)attr[1]->shape->data[0]; + int32_t width = (int32_t)attr[0]->shape->data[0]; + int32_t height = (int32_t)attr[0]->shape->data[1]; + int32_t b = 0, d = 0, j = 0; + int32_t output_base = 0; + int32_t input_base = 0; + + for (b = 0; b < batch; b++) + { + for (d = 0; d < depth; d++) + { + output_base = b * depth * height_o * width_o + d * height_o * width_o; + input_base = b * depth * height * width + d * height * width; + for (j = 0; j < height_o; j++) + { + for (i = 0; i < width_o; i++) + { + int32_t hstart = j * stride_y - pad_top; + int32_t wstart = i * stride_x - pad_left; + int32_t hend = vsi_nn_min(hstart + ksize_y, height); + int32_t wend = vsi_nn_min(wstart + ksize_x, width); + int32_t pool_index = output_base + j * width_o + i; + int32_t h = 0, w = 0; + int32_t index_max = 0; + float value_max = (float)FP32_MIN; + + hstart = vsi_nn_max(hstart, 0); + wstart = vsi_nn_max(wstart, 0); + + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + int32_t index = input_base + h * width + w; + float data = buffer[0][index]; + + if (data > value_max) + { + value_max = data; + index_max = index; + } + } + } + buffer[1][pool_index] = value_max; + buffer[2][pool_index] = (float)index_max; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _maxpoolwithargmax_exec() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _maxpoolwithargmax_exec; + kernel->info.parameters = _maxpoolwithargmax_kernel_param_def; + kernel->info.numParams = _MAXPOOLWITHARGMAX_PARAM_NUM; + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right"); + int32_t 
pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom"); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + int32_t index = 3; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( maxpoolwithargmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c b/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c new file mode 100644 index 0000000..b391edd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c @@ -0,0 +1,247 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
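Since the argmax output written by the maxpoolwithargmax executor above is a flat index into the whole input tensor (`input_base + h * width + w`), downstream code usually needs to decode it back into coordinates. A sketch of that decoding, assuming the same WHCN dimension order used by the kernel:

```c
/* Decode a flat argmax index back to (w, h, d, b), assuming WHCN layout. */
static void decode_argmax(int index, int width, int height, int depth,
                          int *w, int *h, int *d, int *b)
{
    *w = index % width;   index /= width;
    *h = index % height;  index /= height;
    *d = index % depth;   index /= depth;
    *b = index;
}
```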
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.mod") + +/* + * Kernel params + */ +static vx_param_description_t _mod_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) + +static vsi_ssize_t _expand_offset + ( + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape + ) +{ + vsi_size_t i; + vsi_ssize_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + int32_t isfmod = 0; + vsi_nn_kernel_dtype_e input0_dtype = F16; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float* f32_in_buffer[_INPUT_NUM] = {NULL}; + float* f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t* in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t* out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + + /* prepare data */ + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &isfmod); + for (i = 0; i < _INPUT_NUM; i++) { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create(input[i]); + vsi_nn_kernel_tensor_attr_get_stride(in_attr[i], in_stride_size[i]); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer(input[i], in_attr[i], TRUE); + CHECK_PTR_FAIL_GOTO(f32_in_buffer[i], "Create input0 buffer fail.", final); + } + + input0_dtype = in_attr[0]->dtype; + if (input0_dtype == F16 || input0_dtype == F32 || input0_dtype == BF16) { + isfmod = 1; + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create(output[i]); + vsi_nn_kernel_tensor_attr_get_stride(out_attr[i], out_stride_size[i]); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size(out_attr[i]); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float*)malloc(out_bytes[i]); + CHECK_PTR_FAIL_GOTO(f32_out_buffer[i], "Create output buffer fail.", final); + memset(f32_out_buffer[i], 0, out_bytes[i]); + } + + for (i = 0; i < out_elements[0]; i++) + { + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; + float in0 = 0; + float in1 = 0; + + in0_offset = _expand_offset( i, in_attr[0]->shape->data, 
(vsi_size_t)in_attr[0]->shape->size, + in_stride_size[0], out_attr[0]->shape->data ); + in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, + in_stride_size[1], out_attr[0]->shape->data ); + in0 = f32_in_buffer[0][in0_offset]; + in1 = f32_in_buffer[1][in1_offset]; + if (isfmod) + { + f32_out_buffer[0][i] = (float)fmod(in0,in1); + } + else + { + f32_out_buffer[0][i] = in0 - in1 * (float)floor(in0 / in1); + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) { + status = vsi_nn_kernel_tensor_write_from_float( + output[i], out_attr[i], f32_out_buffer[i], out_elements[i]); + CHECK_STATUS_FAIL_GOTO(status, final); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) { + if (f32_in_buffer[i]) { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) { + vsi_nn_kernel_tensor_attr_release(&in_attr[i]); + } + } + + for (i = 0; i < _OUTPUT_NUM; i++) { + if (f32_out_buffer[i]) { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) { + vsi_nn_kernel_tensor_attr_release(&out_attr[i]); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _mod_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _mod_kernel_param_def ); + + return VSI_SUCCESS; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod ); + /* Pass parameters to node. 
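The `isfmod` switch in `_compute()` above only changes the result for negative operands, and floating-point inputs are forced onto the fmod path by the dtype check. A hand-worked example of the difference:

```c
/* Hand-worked example of the two branches:
 *   fmod(-7, 3)                       = -1   (truncating division; sign follows the dividend)
 *   -7 - 3 * floor(-7 / 3) = -7 + 9   =  2   (floored division;    sign follows the divisor)
 */
```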
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( mod, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c index 1af66f0..845c167 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c @@ -55,8 +55,8 @@ __BEGIN_DECLS static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -90,12 +90,16 @@ DEF_KERNEL_EXECUTOR(_compute) uint32_t i = 0; int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; float mean[3] = {0}, scale = 1; + vsi_bool is_rgb888 = tensors[1] == NULL; for (i = 0; i < _CPU_IO_NUM; i++) { tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; - attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); - CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final ); + if (tensors[i]) + { + attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); + CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final ); + } } out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); @@ -113,8 +117,11 @@ DEF_KERNEL_EXECUTOR(_compute) for (i = 0; i < 3; i++) { - buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final ); + if (tensors[i]) + { + buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final ); + } buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) ); CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final ); @@ -125,12 +132,17 @@ DEF_KERNEL_EXECUTOR(_compute) int32_t line1[2], line2[2]; int32_t dx = 0, dy = 0, idx = 0; int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t src_height = (int32_t)attr[0]->shape->data[1]; int32_t dst_width = (int32_t)attr[3]->shape->data[0]; int32_t dst_height = (int32_t)attr[3]->shape->data[1]; uint8_t result = 0; + int32_t offset = 0; + int32_t index = 0; for ( idx = 0; idx < 3; idx ++) { + offset = is_rgb888 ? idx * src_width * src_height : 0; + index = is_rgb888 ? 
0 : idx; for ( dy = 0; dy < (int32_t)dst_height; dy ++) { for ( dx = 0; dx < (int32_t)dst_width; dx ++) @@ -170,10 +182,10 @@ DEF_KERNEL_EXECUTOR(_compute) sy += yOffset; source_index = (sx + sy * src_width); - line1[0] = (int32_t)buffer[idx][source_index]; - line1[1] = (int32_t)buffer[idx][source_index + 1]; - line2[0] = (int32_t)buffer[idx][source_index + src_width]; - line2[1] = (int32_t)buffer[idx][source_index + src_width + 1]; + line1[0] = (int32_t)buffer[index][source_index + offset]; + line1[1] = (int32_t)buffer[index][source_index + 1 + offset]; + line2[0] = (int32_t)buffer[index][source_index + src_width + offset]; + line2[1] = (int32_t)buffer[index][source_index + src_width + 1 + offset]; temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10); temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10); @@ -184,10 +196,10 @@ DEF_KERNEL_EXECUTOR(_compute) } else { - int32_t offset = xOffset + yOffset * src_width; - source_index = dx + dy * src_width + offset; - finalVal = (buffer[0][source_index] - mean[idx]) * scale; - buffer[1][output_index] = finalVal; + int32_t ofset = xOffset + yOffset * src_width; + source_index = dx + dy * src_width + ofset + offset; + finalVal = (buffer[index][source_index] - mean[idx]) * scale; + buffer[idx + 3][output_index] = finalVal; } } } diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c index a5f0467..82e9c1a 100644 --- a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c @@ -209,16 +209,15 @@ DEF_KERNEL_EXECUTOR(_compute) for (n = 0; n < num_rois; n++) { uint32_t batchId = (uint32_t)f32_in_buffer[2][n]; - float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f; float qx1 = f32_in_buffer[1][n * kRoiDim]; float qy1 = f32_in_buffer[1][n * kRoiDim + 1]; float qx2 = f32_in_buffer[1][n * kRoiDim + 2]; float qy2 = f32_in_buffer[1][n * kRoiDim + 3]; - float x1 = qx1 * scale; - float x2 = qx2 * scale; - float y1 = qy1 * scale; - float y2 = qy2 * scale; + float x1 = qx1; + float x2 = qx2; + float y1 = qy1; + float y2 = qy2; float roi_anchor_x = x1 * width_scale; float roi_anchor_y = y1 * height_scale; float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f); diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c new file mode 100644 index 0000000..cad8476 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -0,0 +1,770 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
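With inputs 1 and 2 now optional, the pre-process kernel above handles both three separate plane tensors and a single packed planar tensor whose R/G/B planes are consecutive width*height blocks; the `offset`/`index` pair selects between the two. A small sketch of that addressing (hypothetical helper, not part of the patch):

```c
#include <stddef.h>

/* Return the base pointer of one colour plane for either input layout. */
static const float *plane_base(const float *const bufs[3], int plane,
                               int src_width, int src_height, int packed_rgb888)
{
    if (packed_rgb888)
    {
        /* one tensor, planes stacked as consecutive width*height blocks */
        return bufs[0] + (size_t)plane * src_width * src_height;
    }
    return bufs[plane];   /* one tensor per plane */
}
```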
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define KERNEL_SOURCE_1 "cumsum" +#define KERNEL_SOURCE_2 "cumsum_2d" +#define KERNEL_SOURCE_3 "cumsum_bf16" +#define KERNEL_SOURCE_4 "cumsum_f16_u8" + +// Add kernel hashtable here +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + +#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + +#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } cumsum_map[] = +{ + HASH_CUMSUM_KERNELS(0, U8, U8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, I8, I8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, I16, I16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, F16, F16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS(1, U8, U8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, I8, I8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, I16, I16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, F16, F16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS(2, U8, U8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, I8, I8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, I16, I16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, F16, F16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS(0, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(0, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(0, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(1, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(1, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(1, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(2, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(2, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(2, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(0, F16, I16, 
KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4) +}; + +/* + * Kernel params + */ +static vx_param_description_t _cumsum_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_cumsum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t axis = 0; + int32_t width = 0; + int32_t height = 0; + int32_t channel = 0; + int32_t w = 1; + int32_t h = 1; + int32_t c = 1; + uint32_t dim = 1; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * input_shape = NULL; + int32_t input_zp = 0; + float input_scale = 1.0f; + float output_zp = 0; + float output_scale = 1.0f; + float in_out_zp_scale = 1.0f; + float in_out_scale = 1.0f; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = attr[0]->asymm.scale; + input_zp = attr[0]->asymm.zero_point; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + output_scale = (float)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + } + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + output_scale = 1.0f / attr[1]->asymm.scale; + output_zp = (float)attr[1]->asymm.zero_point; + } + + in_out_scale = input_scale * output_scale; + in_out_zp_scale = (float)in_out_scale * input_zp; + + input_shape = attr[0]->shape; + dim = (uint32_t)input_shape->size; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + channel = (int32_t)(dim > 2 ? 
input_shape->data[2] : 1); + + + if (axis == 0) + { + w = 1; + h = height; + c = channel; + } + else if (axis == 1) + { + w = width; + h = 1; + c = channel; + } + else if (axis == 2) + { + w = width; + h = height; + c = 1; + } + + shaderParam.global_scale[0] = 8; + if ((attr[0]->dtype == U8 || attr[0]->dtype == I8) + && (axis > 0)) + { + shaderParam.global_scale[0] = 16; + } + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0]; + shaderParam.global_size[1] = h; + shaderParam.global_size[2] = c; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \ + (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis, dim); + + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniAccSumVertF16toF16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32A_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32B_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00150004, 0x00370026, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32C_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00190008, 0x003b002a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32D_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x001d000c, 0x003f002e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniSumHorzF16toF16A_4x4 = {{ + 0x55150501, // TCfg + 0x00000000, // ASelt + 0x00100000, 0x32100210, // ABin + 0xaa2a0a02, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 
0x00003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumHorzF16toF16B_4x4 = {{ + 0x55150501, // TCfg + 0x00000000, // ASelt + 0x00540004, 0x76540654, // ABin + 0xaa2a0a02, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumHorzF16toF16C_2x8 = {{ + 0x55551111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x37363534, // ABin + 0xaaaa2222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumHorzF16toF16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x73727170, 0x77767574, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniSumHorzU8toI16A_4x4 = {{ + 0x55150501, // TCfg + 0x00000000, // ASelt + 0x00100000, 0x32100210, // ABin + 0xaa2a0a02, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumHorzU8toI16B_8x4 = {{ + 0x05550155, 0x55551555, // TCfg + 0x00418820, 0x41882000, 0x8820000a, 0x20018a41, 0x398a4188, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x01010101, 0x00000001, 0x01010101, 0x00000101, + 0x01010101, 0x00010101, 0x01010101, 0x01010101 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSubZpI16toI16_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00020001, 0x00030001, 0x00040001, + 0x00050001, 0x00060001, 0x00070001, 0x00080001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumHorzI16toI32A_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00310030, 0x00330032, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumHorzI16toI32B_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00350034, 0x00370036, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 
0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniSetZeroF16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8, 2, 3): + case _PACK_SELECT_KEY( I8, I8, 2, 3): + case _PACK_SELECT_KEY( I16, I16, 2, 3): + case _PACK_SELECT_KEY( F16, F16, 2, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 ); + status |= 
vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, 0, 2): + case _PACK_SELECT_KEY( U8, U8, 1, 2): + case _PACK_SELECT_KEY( U8, U8, 0, 3): + case _PACK_SELECT_KEY( U8, U8, 1, 3): + case _PACK_SELECT_KEY( I8, I8, 0, 2): + case _PACK_SELECT_KEY( I8, I8, 1, 2): + case _PACK_SELECT_KEY( I8, I8, 0, 3): + case _PACK_SELECT_KEY( I8, I8, 1, 3): + case _PACK_SELECT_KEY( I16, I16, 0, 2): + case _PACK_SELECT_KEY( I16, I16, 1, 2): + case _PACK_SELECT_KEY( I16, I16, 0, 3): + case _PACK_SELECT_KEY( I16, I16, 1, 3): + case _PACK_SELECT_KEY( F16, F16, 0, 2): + case _PACK_SELECT_KEY( F16, F16, 1, 2): + case _PACK_SELECT_KEY( F16, F16, 0, 3): + case _PACK_SELECT_KEY( F16, F16, 1, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( BF16, BF16, 0, 2): + case _PACK_SELECT_KEY( BF16, BF16, 1, 2): + case _PACK_SELECT_KEY( BF16, BF16, 0, 3): + case _PACK_SELECT_KEY( BF16, BF16, 1, 3): + case _PACK_SELECT_KEY( BF16, BF16, 2, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + 
status |= vsi_nn_kernel_gpu_add_param( + node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8, 0, 2): + case _PACK_SELECT_KEY( F16, U8, 1, 2): + case _PACK_SELECT_KEY( F16, U8, 0, 3): + case _PACK_SELECT_KEY( F16, U8, 1, 3): + case _PACK_SELECT_KEY( F16, U8, 2, 3): + case _PACK_SELECT_KEY( F16, I8, 0, 2): + case _PACK_SELECT_KEY( F16, I8, 1, 2): + case _PACK_SELECT_KEY( F16, I8, 0, 3): + case _PACK_SELECT_KEY( F16, I8, 1, 3): + case _PACK_SELECT_KEY( F16, I8, 2, 3): + case _PACK_SELECT_KEY( F16, I16, 0, 2): + case _PACK_SELECT_KEY( F16, I16, 1, 2): + case _PACK_SELECT_KEY( F16, I16, 0, 3): + case _PACK_SELECT_KEY( F16, I16, 1, 3): + case _PACK_SELECT_KEY( F16, I16, 2, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "multAndoutZP0", &multAndoutZP0); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t axis, + int32_t is_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + + for( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) + { + if ( cumsum_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(cumsum_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name ); + kernel->info.parameters = _cumsum_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def ); + kernel->info.initialize = _cumsum_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + cumsum_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + cumsum_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + 
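Editorial note, not part of the patch: both the initializer above and _query_kernel select behavior through a packed 32-bit key — the visible _PACK_SELECT_KEY macro packs input dtype, output dtype, axis and dim as (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24)), and the kernel map is then scanned linearly for an exact match. The standalone C sketch below only illustrates that dispatch pattern under those assumptions; the dtype codes, table contents, and the names PACK_KEY, kernel_entry_t and find_kernel are hypothetical and are not the driver's actual definitions.

/* Minimal sketch of the packed-key kernel dispatch used in this patch.
 * The bit layout mirrors _PACK_SELECT_KEY; everything else is illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

enum { DT_U8 = 1, DT_I8, DT_I16, DT_F16 };   /* placeholder dtype codes */

#define PACK_KEY(in, out, axis, dim) \
    ((uint32_t)(in) | ((uint32_t)(out) << 8) | \
     ((uint32_t)(axis) << 16) | ((uint32_t)(dim) << 24))

typedef struct {
    uint32_t    key;          /* packed (dtype_in, dtype_out, axis, dim) */
    const char *kernel_name;  /* shader entry the driver would bind */
} kernel_entry_t;

static const kernel_entry_t table[] = {
    { PACK_KEY(DT_U8,  DT_U8,  2, 3), "cumsum_axis2_U8toU8"      },
    { PACK_KEY(DT_F16, DT_F16, 1, 2), "cumsum_2D_axis1_F16toF16" },
};

/* Linear scan for an exact key match, as _query_kernel does over cumsum_map. */
static const char *find_kernel(uint32_t key)
{
    size_t i;
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (table[i].key == key) {
            return table[i].kernel_name;   /* first exact match wins */
        }
    }
    return NULL;                           /* unsupported combination */
}

int main(void)
{
    const char *name = find_kernel(PACK_KEY(DT_F16, DT_F16, 1, 2));
    printf("%s\n", name ? name : "no kernel");
    return 0;
}

A combination not present in the table simply yields no match, which corresponds to _query_kernel returning VSI_FAILURE and the op falling back or being rejected.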
+static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t axis_new = 0; + int32_t is_2d = 0; + uint32_t rs_dim = 2; + int32_t i = 0; + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + if (exclusive || reverse || rs_dim > 3) + { + return NULL; + } + + if (rs_dim == 2) + { + is_2d = 1; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], (vsi_size_t)rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], (vsi_size_t)rs_dim ); + + status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( tmp_params, _CUMSUM_PARAM_NUM, + reshape_tensors, 1, &reshape_tensors[1], 1 ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _CUMSUM_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + } + } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( cumsum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 6e85e40..dbbfc6e 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -53,6 +53,9 @@ typedef enum UNARY_HGELU, UNARY_SELU, UNARY_CELU, + UNARY_RCP, + UNARY_SIGN, + UNARY_SOFTSIGN, } unary_type_e; /* @@ -94,6 +97,34 @@ typedef enum #define HGELU_OPERATION hard_gelu #define SELU_OPERATION selu #define CELU_OPERATION celu +#define RCP_OPERATION rcp +#define SIGN_OPERATION sign +#define SOFTSIGN_OPERATION softsign + +#define ADD_UNARY_SH_KERNELS(name, source) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, F16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, U8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, U8, 
source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I8, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, I16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, I16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, F16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, I8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, I8, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, F16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, U8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, U8, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, F16, source##_2D) \ + static const struct { uint32_t key; @@ -101,269 +132,22 @@ static const struct { const char* source_name; } _eltwise_unary_evis_kernel_map[] = { - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, F16, 
KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, 
UNARY_CELU, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_3D) + ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(COS, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(EXP, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(LOG, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(SELU, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(CELU, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(NEG, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(RCP, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(SIGN, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(SOFTSIGN, KERNEL_SOURCE1) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_2D) 
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, 
BF16, BF16, KERNEL_SOURCE1_2D) - - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, 
UNARY_MISH, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16, 
KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16, KERNEL_SOURCE0_2D) + ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(ROUND, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(GELU, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(HGELU, KERNEL_SOURCE0) }; #undef SIN_OPERATION @@ -378,6 +162,9 @@ static const struct { #undef GELU_OPERATION #undef HGELU_OPERATION #undef CELU_OPERATION +#undef RCP_OPERATION +#undef SIGN_OPERATION +#undef SOFTSIGN_OPERATION /* * Kernel params */ @@ -509,6 +296,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_SOFTSIGN, BF16, BF16 ): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -815,5 +605,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sign, UNARY_SIGN ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( softsign, UNARY_SOFTSIGN ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 3dc67d2..499bc5a 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -222,7 +222,7 @@ static vsi_status get_gather_tensor_reshape_size uint32_t i = 0; vsi_size_t elementCnt = 1; vsi_size_t outerCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH for(i = 0; i < dims_num - batch_dims; ++i) { @@ -751,7 +751,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { -#define VSI_NN_MAX_BLOCK_SIZE (65536) +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -795,12 
+795,6 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[2] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[2], rs_dim ); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) - { - return NULL; - } - status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch); if ( VSI_SUCCESS == status) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 0692c07..05362bb 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -136,7 +136,7 @@ static vsi_status get_gather_nd_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index 9693c29..8a9971f 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -34,8 +34,8 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" __BEGIN_DECLS @@ -45,78 +45,57 @@ __BEGIN_DECLS typedef enum { - INTERNAL_KERNEL_SUM_SQR, - INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_SUMS, + INTERNAL_KERNEL_MEANS, INTERNAL_KERNEL_NORM, } _internal_kernel_e; -#define KERNEL_SOURCE_1 "group_normalization_i8" -#define KERNEL_SOURCE_2 "group_normalization_u8" -#define KERNEL_SOURCE_3 "group_normalization_i16" -#define KERNEL_SOURCE_4 "group_normalization_f16" -#define KERNEL_SOURCE_5 "group_normalization_u8_f16" -#define KERNEL_SOURCE_6 "group_normalization_i8_scale" -#define KERNEL_SOURCE_7 "group_normalization_i16_scale" -#define KERNEL_SOURCE_8 "group_normalization_f16_scale" +#define KERNEL_SOURCE_0 "group_normalization_0" +#define KERNEL_SOURCE_1 "group_normalization_1" +#define KERNEL_SOURCE_2 "group_normalization_2" -#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE) +#define HASH_GROUPNORM_SUMS_SH_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sums_"#SRC0_TYPE) -#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE"_2D") +#define HASH_GROUPNORM_SUMS_SH_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sums_"#SRC0_TYPE"_2D") -#define HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME \ - CVIVANTE_NAMESPACE("evis.group_norm_meanvari") - -#define HASH_GROUPNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE) - -#define HASH_GROUPNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_GROUPNORM_MEANS_SH_KERNEL_NAME \ + CVIVANTE_NAMESPACE("evis.group_norm_means") #define HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE) #define HASH_GROUPNORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ - 
CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE"_2D") // Add kernel hashtable here // Sum Sqr -#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ +#define HASH_GROUPNORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) -#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \ - HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(IN0_TYPE), \ +#define TENSOR_GROUPNORM_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SUMS_SH_KERNEL_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \ - HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(IN0_TYPE), \ +#define TENSOR_GROUPNORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SUMS_SH_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, -#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \ +#define HASH_GROUPNORM_MEANS_KEY(_input0_type, _output_type) \ ((_input0_type << 24) | (_output_type << 16)) #define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \ - { HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \ - HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME, \ + { HASH_GROUPNORM_MEANS_KEY(F32, F32), \ + HASH_GROUPNORM_MEANS_SH_KERNEL_NAME, \ SOURCE }, // normalization #define HASH_GROUPNORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) -#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ - HASH_GROUPNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ - HASH_GROUPNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - #define TENSOR_GROUPNORM_SCALE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ { HASH_GROUPNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ @@ -134,73 +113,73 @@ typedef struct const char * source_name; } _kernel_map_type; -static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] = +static const _kernel_map_type _groupnorm_sums_kernel_map[] = { // Register kernel here - TENSOR_GROUPNORM_SUM_SQR_KERNELS( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS( F16, F32, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_SUMS_KERNELS(I8, F32, KERNEL_SOURCE_0) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( I8, F32, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 ) + 
TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) }; static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] = { // Register kernel here - TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_0 ) }; static const _kernel_map_type _groupnorm_kernel_map[] = { // Register kernel here - TENSOR_GROUPNORM_KERNELS( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS_2D( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F16, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F16, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F16, F16, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F16, F16, KERNEL_SOURCE_0 ) - TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) - TENSOR_GROUPNORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F16, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F16, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F16, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F16, F16, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_KERNELS( I16, F16, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_KERNELS_2D( I16, F16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F16, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F16, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F16, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F16, F16, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_KERNELS( F16, U8, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F16, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F16, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F16, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F16, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, F16, KERNEL_SOURCE_5 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, I8, KERNEL_SOURCE_6 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, I8, KERNEL_SOURCE_6 ) - TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, F16, KERNEL_SOURCE_6 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, F16, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, F16, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, F16, KERNEL_SOURCE_0 ) - TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, I16, KERNEL_SOURCE_7 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, I16, KERNEL_SOURCE_7 ) - 
TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, F16, KERNEL_SOURCE_7 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, F16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, F16, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, U8, KERNEL_SOURCE_8 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_8 ) - TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_8 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 ) }; /* * Kernel params */ -static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = +static vx_param_description_t _groupnorm_sums_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -208,9 +187,9 @@ static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _GROUPNORM_SUM_SQR_PARAM_NUM _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def ) +#define _GROUPNORM_SUMS_PARAM_NUM _cnt_of_array( _groupnorm_sums_kernel_param_def ) -static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = +static vx_param_description_t _groupnorm_means_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -218,7 +197,7 @@ static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _GROUPNORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _groupnorm_mean_vari_kernel_param_def ) +#define _GROUPNORM_MEANS_PARAM_NUM _cnt_of_array( _groupnorm_means_kernel_param_def ) static vx_param_description_t _groupnorm_kernel_param_def[] = { @@ -238,7 +217,7 @@ static vx_param_description_t _groupnorm_kernel_param_def[] = /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) +DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -255,19 +234,17 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1; - int32_t input_zp = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - float tmpZp2 = 0; - float e2InScale = 0; - float rowSumScale = 0; int32_t is2D = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; - float in_scale_fl = 1, inFlScale_s2 = 1; + float input_scale = 1; + float input_scale2 = 1; + float input_zp = 1; + float sum_x_tail = 1; + float sum_x2_tail0 = 1; + float sum_x2_tail1 = 1; + float work_item_pixels = 1; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -277,26 +254,10 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &is2D); 
CHECK_STATUS_FAIL_GOTO(status, OnError ); - input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - inFlScale_s2 = in_scale_fl * in_scale_fl; - } - + input_shape = attr[0]->shape; + input_scale = attr[0]->scale; + input_scale2 = input_scale * input_scale; + input_zp = (float)attr[0]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); @@ -304,16 +265,12 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) { height = 1; } - iter = height * 16; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - e2InScale = scaleIn * scaleIn; - tmpZp2 = input_zp * input_zp * e2InScale; - rowSumScale = height * 16 * tmpZp2; - } + work_item_pixels = (float)height * 16; + + sum_x_tail = -work_item_pixels * input_zp * input_scale; + sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2; + sum_x2_tail1 = -2 * input_zp * input_scale2; shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -336,9 +293,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - gpu_dp_inst_t uniSumU8_16x1 = {{ + gpu_dp_inst_t uniSumX_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -347,7 +304,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) 0x00002400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -356,70 +313,33 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) 0x00000400, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status = vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, 
OnError ); } - else if (attr[0]->dtype == I8) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { - gpu_dp_inst_t uniSumInt8_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSumInt8_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - status = vsi_nn_kernel_gpu_add_param(node, "uniSumInt8_16x1", &uniSumInt8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSumInt8_16x1", &uniSqrSumInt8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == I16) - { - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == F16) - { - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status = vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -442,7 +362,7 @@ OnError: return status; } -DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) +DEF_KERNEL_INITIALIZER(_groupnorm_means_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -523,13 +443,10 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL, NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - float scaleOut = 1.0f; - float reScaleOut_u8 = 
1.0f; - float scale_inOut = 1.0f; - int32_t output_zp = 0; - int32_t input_zp = 0; - float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; + float input_scale = 1; + float input_zp = 0; + float output_scale = 1.0f; + float output_zp = 0; int32_t height = 0, width = 0, chn = 0; int32_t is2D = 0; @@ -546,49 +463,10 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = attr[2]->asymm.zero_point; - scaleOut = attr[2]->asymm.scale; - reScaleOut_u8 = 1 / scaleOut; - } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[2]->dfp.fl > 0) - { - out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_zp = 0; - } - - if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) - { - inOut_fl_scale = in_scale_fl * out_scale_fl; - } + input_scale = attr[0]->scale; + input_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[2]->scale; + output_zp = (float)attr[2]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); @@ -624,149 +502,65 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 
0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Fst_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Secd_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toInt16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_2_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00090008, 0x000b000a, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_3_4x4 = {{ 0x01010101, // 
TCfg 0x00000000, // ASelt 0x000d000c, 0x000f000e, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ 0x11111111, // TCfg 0x11110000, // ASelt 0x06040200, 0x06040200, // ABin 0x22222222, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; uint32_t pack_key = 0; @@ -775,116 +569,67 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); - if (attr[3]->dtype != F32) - { - status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - } - if (!(attr[3]->dtype == F32 && (attr[0]->dtype == I16 || attr[0]->dtype == I8))) - { - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - } - CHECK_STATUS_FAIL_GOTO(status, OnError ); - switch( pack_key ) { case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( U8, F16 ): case _PACK_SELECT_KEY( I8, F16 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt8Fp32_4x4", - &uniConvertDirUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt8Fp32_4x4", - &uniConvertEndUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdInt8Fp32_4x4", - &uniConvertTrdUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", - &uniConvertFthUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - 
&uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); - - scale_inOut = reScaleOut_u8 * scaleIn; - status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_2_4x4", + &uniDataToFP32_2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_3_4x4", + &uniDataToFP32_3_4x4); + if (attr[2]->dtype != F16) + { + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; case _PACK_SELECT_KEY( I16, I16 ): case _PACK_SELECT_KEY( I16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", - &uniConvertInt16Fp32Fst_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", - &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", - &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; case _PACK_SELECT_KEY( F16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", - &uniConvertEndInt16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; + case _PACK_SELECT_KEY( F16, I16 ): case _PACK_SELECT_KEY( F16, U8 ): + case _PACK_SELECT_KEY( F16, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, 
"uniConvertEndInt16Fp32_4x4", - &uniConvertEndInt16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -941,19 +686,19 @@ static vsi_status _query_kernel switch( kernel_id ) { - case INTERNAL_KERNEL_SUM_SQR: - initializer = _groupnorm_sum_sqr_initializer; - kernel_map = _groupnorm_sum_sqr_kernel_map; - kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map ); - param_def = _groupnorm_sum_sqr_kernel_param_def; - param_size = _GROUPNORM_SUM_SQR_PARAM_NUM; + case INTERNAL_KERNEL_SUMS: + initializer = _groupnorm_sums_initializer; + kernel_map = _groupnorm_sums_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_sums_kernel_map ); + param_def = _groupnorm_sums_kernel_param_def; + param_size = _GROUPNORM_SUMS_PARAM_NUM; break; - case INTERNAL_KERNEL_MEAN_VARI: - initializer = _groupnorm_mean_vari_initializer; + case INTERNAL_KERNEL_MEANS: + initializer = _groupnorm_means_initializer; kernel_map = _groupnorm_mean_vari_kernel_map; kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map ); - param_def = _groupnorm_mean_vari_kernel_param_def; - param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM; + param_def = _groupnorm_means_kernel_param_def; + param_size = _GROUPNORM_MEANS_PARAM_NUM; break; case INTERNAL_KERNEL_NORM: initializer = _groupnorm_initializer; @@ -1008,8 +753,8 @@ static vsi_nn_kernel_node_t _setup #define SUM_SQR_INDEX (0) #define MEAN_VARI_INDEX (1) vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t sums_node_params[_GROUPNORM_SUMS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t means_node_params[_GROUPNORM_MEANS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; vsi_nn_kernel_node_t node = NULL; @@ -1026,9 +771,9 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkey = 0; int32_t i = 0; float rSpaceOrg = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); - vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; + float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; float group_ratio = 1.0f / 
(inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); // Check if gpu can support the size @@ -1038,7 +783,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, + status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, group_num, 0, new_shape); if ( VSI_SUCCESS != status ) { @@ -1048,7 +793,7 @@ static vsi_nn_kernel_node_t _setup rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); // Assign unique_id @@ -1059,16 +804,16 @@ static vsi_nn_kernel_node_t _setup in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); - hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); + hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUMS_KEY( in0_dtype, F32, is2D_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEANS_KEY( F32, F32 ); hashkey = HASH_GROUPNORM_KEY( in0_dtype, in2_dtype, out_dtype, is2D_flg ); - status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); + status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUMS ); if ( VSI_SUCCESS != status ) { goto final; } - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEANS ); if ( VSI_SUCCESS != status ) { goto final; @@ -1103,26 +848,21 @@ static vsi_nn_kernel_node_t _setup if (tmp_node) { uint32_t index = 0; - sum_sqr_node_params[index++] = rs_input; - sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; - sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + sums_node_params[index++] = rs_input; + sums_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); - status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params, - _GROUPNORM_SUM_SQR_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( tmp_node, sums_node_params, + _GROUPNORM_SUMS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &sum_sqr_node_params[2] ); - vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] ); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); { // Set default border mode. 
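/* [Editorial note, not part of the patch] The border setup below changes behaviour slightly: the
 * removed code wrote the zero point into border.constant_value.U8 only for VSI_NN_TYPE_UINT8
 * inputs, whereas vsi_nn_Float32ToDtype(0, ...) encodes real-value 0.0 in whatever dtype and
 * quantization the input actually uses (for asymmetric U8/I8 that is again the zero point), so
 * constant padding is handled consistently for all supported input types. */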
vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1133,26 +873,21 @@ static vsi_nn_kernel_node_t _setup if (tmp_node1) { uint32_t index = 0; - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); + means_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + means_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + means_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + means_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); - status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params, - _GROUPNORM_MEAN_VARI_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( tmp_node1, means_node_params, + _GROUPNORM_MEANS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + vsi_nn_kernel_scalar_release( &means_node_params[2] ); + vsi_nn_kernel_scalar_release( &means_node_params[3] ); { // Set default border mode. vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } status = vxSetNodeAttribute( (vx_node)tmp_node1, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1186,19 +921,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[6] ); vsi_nn_kernel_scalar_release( &node_params[7] ); vsi_nn_kernel_scalar_release( &node_params[8] ); - { - // Set default border mode. - vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } /* Pass parameters to node. 
*/ diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 69057be..510069b 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -44,7 +44,7 @@ __BEGIN_DECLS typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, - HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + HSIGMOID = VSI_NN_ACT_HARD_SIGMOID, }grucell_nn_activation_type_e; #define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h" @@ -72,6 +72,10 @@ static const _kernel_map_type _grucell_activation_z_h_kernel_map[] = PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), }; /* diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c index 5ba28e6..8522000 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -45,7 +44,7 @@ __BEGIN_DECLS typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, - HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + HSIGMOID = VSI_NN_ACT_HARD_SIGMOID, }grucell_nn_activation_type_e; #define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r" @@ -72,9 +71,12 @@ static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] = PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ), PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ), PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), }; - /* * Kernel params */ @@ -256,8 +258,6 @@ final: return status; } /* _grucell_h_times_activation_r_initializer() */ - - /* * Query kernel */ @@ -313,7 +313,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index 4f3367e..f641e10 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -35,7 +35,8 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" __BEGIN_DECLS @@ -45,67 +46,46 @@ __BEGIN_DECLS typedef enum { - INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_SUMS, INTERNAL_KERNEL_NORM, } _internal_kernel_e; -#define KERNEL_SOURCE_1 "instance_normalization_i8" -#define KERNEL_SOURCE_2 "instance_normalization_u8" -#define KERNEL_SOURCE_3 "instance_normalization_i16" -#define KERNEL_SOURCE_4 "instance_normalization_f16" -#define KERNEL_SOURCE_5 "instance_normalization_u8_f16" -#define KERNEL_SOURCE_6 "instance_normalization_scale_f32" -#define 
KERNEL_SOURCE_7 "instance_normalization_scale_f32_f16" -#define KERNEL_SOURCE_8 "instance_normalization_scale_f32_bf16" +#define KERNEL_SOURCE_0 "instance_normalization_0" +#define KERNEL_SOURCE_1 "instance_normalization_1" +#define KERNEL_SOURCE_2 "instance_normalization_2" +#define KERNEL_SOURCE_3 "instance_normalization_3" -#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE) +#define HASH_INSTANCENORM_SUMS_SH_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_sums_"#SRC0_TYPE) -#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_2D_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE"_2D") - -#define HASH_INSTANCENORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE) - -#define HASH_INSTANCENORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_sums_"#SRC0_TYPE"_2D") #define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE) + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE) #define HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D") + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE"_2D") // Add kernel hashtable here -// mean vari -#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ +#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \ - HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS_3D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_SUMS_SH_KERNEL_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \ - HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_2D_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, // normalization #define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) -#define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ - HASH_INSTANCENORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ - HASH_INSTANCENORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_INSTANCENORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_INSTANCENORM_SCALE_KERNELS_3D(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \ HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, @@ -122,62 +102,57 @@ typedef struct const char * source_name; 
} _kernel_map_type; -static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = +static const _kernel_map_type _instancenorm_sums_kernel_map[] = { // Register kernel here - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( BF16, F32, KERNEL_SOURCE_8 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( BF16, F32, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( I8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( F16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( BF16, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( BF16, F32, KERNEL_SOURCE_3 ) }; static const _kernel_map_type _instancenorm_kernel_map[] = { // Register kernel here - TENSOR_INSTANCENORM_KERNELS( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS_2D( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) - TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( U8, U8, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I8, I8, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_0 ) - TENSOR_INSTANCENORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_KERNELS( I16, F16, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_KERNELS_2D( I16, F16, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( U8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_6 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I16, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, F16, KERNEL_SOURCE_2 ) + 
TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, I8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, I8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, U8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_6 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_6 ) - - TENSOR_INSTANCENORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_6 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) - - TENSOR_INSTANCENORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_7 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_7 ) - - TENSOR_INSTANCENORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_8 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( BF16, BF16, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_3 ) }; /* * Kernel params */ -static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = +static vx_param_description_t _instancenorm_sums_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -185,7 +160,7 @@ static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def ) +#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def ) static vx_param_description_t _instancenorm_kernel_param_def[] = { @@ -203,7 +178,7 @@ static vx_param_description_t _instancenorm_kernel_param_def[] = /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) +DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -220,65 +195,44 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1; - int32_t input_zp = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - float tmpZp2 = 0; - float e2InScale = 0; - float rowSumScale = 0; - int32_t rsFlg = 0; + int32_t rs_flag = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; - float in_scale_fl = 1, inFlScale_s2 = 1; + float input_scale = 1; + float input_scale2 = 1; + float input_zp = 1; + float sum_x_tail = 1; + float sum_x2_tail0 = 1; + float sum_x2_tail1 = 1; + float work_item_pixels = 1; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rsFlg); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rs_flag); CHECK_STATUS_FAIL_GOTO(status, OnError ); - input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if 
(attr[0]->dfp.fl > 0) - { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - inFlScale_s2 = in_scale_fl * in_scale_fl; - } + input_shape = attr[0]->shape; + input_scale = attr[0]->scale; + input_scale2 = input_scale * input_scale; + input_zp = (float)attr[0]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - if (rsFlg) + if (rs_flag) { height = height / chn; } - iter = height * 16; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - e2InScale = scaleIn * scaleIn; - tmpZp2 = input_zp * input_zp * e2InScale; - rowSumScale = height * 16 * tmpZp2; - } + work_item_pixels = (float)height * 16; + + sum_x_tail = -work_item_pixels * input_zp * input_scale; + sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2; + sum_x2_tail1 = -2 * input_zp * input_scale2; shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -301,9 +255,9 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - gpu_dp_inst_t uniSumU8_16x1 = {{ + gpu_dp_inst_t uniSumX_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -312,36 +266,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) 0x00002400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == I8) - { - gpu_dp_inst_t uniSumInt8_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSumInt8_16x1 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -351,40 +276,33 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumInt8_16x1", &uniSumInt8_16x1); - status |= 
vsi_nn_kernel_gpu_add_param(node, "uniSqrSumInt8_16x1", &uniSqrSumInt8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + status = vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if (attr[0]->dtype == I16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == F16) - { - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status = vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } else if (attr[0]->dtype == BF16) @@ -450,15 +368,14 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - float scaleOut = 1.0f; - float scale_inOut = 1.0f; - int32_t output_zp = 0; - int32_t input_zp = 0; - float dimRatio = 0; + float input_scale = 1; + float output_scale = 1; + float input_zp = 0; + float output_zp = 0; + float inv_multiplier = 0; vx_uint32 group_num = 0; vx_int32 height = 0, width = 0, chn = 0; - int32_t rsFlg = 0; + int32_t rs_flag = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -469,59 +386,24 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) 
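/* [Editorial note, not part of the patch] The normalization-pass initializer below gets the same
 * treatment as the sums pass: the ASYMM/DFP branches collapse into attr->scale / attr->zero_point.
 * output_scale is stored as the reciprocal (1.0f / attr[3]->scale), presumably so the shader can
 * requantize the normalized result y with a single multiply, q_out = y * output_scale + output_zp,
 * and inv_multiplier (previously dimRatio) is the 1 / (width * height) averaging factor used to
 * turn the accumulated sums into mean and variance. */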
attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rs_flag); CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = attr[3]->asymm.zero_point; - scaleOut = attr[3]->asymm.scale; - scaleOut = 1 / scaleOut; - } - else if (attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[3]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[3]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); - } - output_zp = 0; - } - - scale_inOut = scaleIn * scaleOut; + input_scale = attr[0]->scale; + input_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[3]->scale; + output_zp = (float)attr[3]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[2]->shape->data[1]); - if (rsFlg) + if (rs_flag) { height = height / chn; } - dimRatio = (float)(1.0 / (width * height)); + inv_multiplier = (float)(1.0 / (width * height)); group_num = (width + 255) / 256; @@ -544,151 +426,66 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // 
AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Fst_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Secd_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toInt16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_2_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00090008, 0x000b000a, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + 
gpu_dp_inst_t uniDataToFP32_3_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x000d000c, 0x000f000e, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ 0x11111111, // TCfg 0x11110000, // ASelt 0x06040200, 0x06040200, // ABin 0x22222222, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg 0x01010101, // ASelt @@ -721,151 +518,77 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) }, GPU_DP_TYPE_16}; uint32_t pack_key = 0; -#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ - (IN0_TYPE | (IN1_TYPE << 8) | (OUT_TYPE << 16)) +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (OUT_TYPE << 16)) - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[3]->dtype ); + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[3]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) { - case _PACK_SELECT_KEY( I8, F16, I8 ): - case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F16 ): + case _PACK_SELECT_KEY( I8, F16 ): + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( I8, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt8Fp32_4x4", - &uniConvertDirUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt8Fp32_4x4", - &uniConvertEndUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdInt8Fp32_4x4", - &uniConvertTrdUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", - &uniConvertFthUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); + if (attr[3]->dtype == F16) + { + status = 
vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_2_4x4", + &uniDataToFP32_2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_3_4x4", + &uniDataToFP32_3_4x4); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( I16, F16 ): + case _PACK_SELECT_KEY( F16, F16 ): + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( F16, I16 ): + case _PACK_SELECT_KEY( F16, U8 ): + case _PACK_SELECT_KEY( F16, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); + if (attr[3]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, F32, U8 ): - case _PACK_SELECT_KEY( I8, F32, I8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - 
&uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, F16, I16 ): - case _PACK_SELECT_KEY( I16, F16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", - &uniConvertInt16Fp32Fst_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", - &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", - &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, F32, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", - &uniConvertInt16Fp32Fst_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", - &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", - &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( F16, F16, F16 ): - case _PACK_SELECT_KEY( F16, F32, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", - &uniConvertEndInt16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( BF16, F32, BF16 ): + case _PACK_SELECT_KEY( BF16, BF16 ): { status = vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part0_2x8", 
&uniConvBF16toF32_Part0_2x8 ); @@ -927,14 +650,14 @@ static vsi_status _query_kernel size_t param_size = 0; uint32_t i = 0; - switch( kernel_id ) + switch ( kernel_id ) { - case INTERNAL_KERNEL_MEAN_VARI: - initializer = _instancenorm_mean_vari_initializer; - kernel_map = _instancenorm_mean_vari_kernel_map; - kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map ); - param_def = _instancenorm_mean_vari_kernel_param_def; - param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM; + case INTERNAL_KERNEL_SUMS: + initializer = _instancenorm_sums_initializer; + kernel_map = _instancenorm_sums_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map ); + param_def = _instancenorm_sums_kernel_param_def; + param_size = _INSTANCENORM_SUMS_PARAM_NUM; break; case INTERNAL_KERNEL_NORM: initializer = _instancenorm_initializer; @@ -948,7 +671,7 @@ static vsi_status _query_kernel return VSI_FAILURE; } - for( i = 0; i < kernel_map_size; i ++ ) + for ( i = 0; i < kernel_map_size; i ++ ) { if ( kernel_map[i].key == hashkey ) { @@ -989,7 +712,7 @@ static vsi_nn_kernel_node_t _setup #define INTERNAL_KERNEL_SIZE (1) #define MEAN_VARI_INDEX (0) vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t node = NULL; @@ -1004,14 +727,53 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; + int32_t axis[VSI_NN_MAX_DIM_NUM] = {0, 1}; + int32_t axis_num = 2; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1 }; + uint32_t axis_size = 0; uint32_t rank = outputs[0]->attr.dim_num; + vsi_nn_tensor_t *reshape_tensor[2] = {NULL}; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH - && rank > 2; + int32_t reshape_flg = 0; + vsi_size_t batch = 1; + vsi_bool ret = FALSE; + ret = vsi_nn_kernel_optimize_tensor_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape, &rank, new_axis, &axis_size); + if ( ret == FALSE || axis_size > 2 ) + { + return NULL; + } + + for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + batch *= inputs[0]->attr.size[i]; + } + + if (axis_size == 1) + { + for (i = rank; i > 1; i--) + { + new_shape[i] = new_shape[i - 1]; + } + new_shape[1] = 1; + rank ++; + } + new_shape[2] = rank == 2 ? 
1 : new_shape[2] / batch; + new_shape[3] = batch; + rank = 4; + + reshape_tensor[0] = vsi_nn_reshape_tensor( graph, + inputs[0], new_shape, rank ); + reshape_tensor[1] = vsi_nn_reshape_tensor( graph, + outputs[0], new_shape, rank ); + + reshape_flg = rank > 2 && new_shape[1] * new_shape[2] < GPU_TENSOR_MAX_WIDTH; // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( - outputs[0]->attr.size, outputs[0]->attr.dim_num ) || + reshape_tensor[1]->attr.size, reshape_tensor[1]->attr.dim_num ) || rank > 4 ) { return NULL; @@ -1024,14 +786,15 @@ static vsi_nn_kernel_node_t _setup ikernels[i]->unique_id = kernel->unique_id; } - in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in0_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[1]->attr.dtype.vx_type ); + in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; - hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg ); hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg ); - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_SUMS ); if ( VSI_SUCCESS != status ) { goto final; @@ -1044,34 +807,27 @@ static vsi_nn_kernel_node_t _setup if (reshape_flg) { - shape[0] = inputs[0]->attr.size[0]; - shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; + shape[0] = new_shape[0]; + shape[1] = new_shape[1] * new_shape[2]; shape[2] = 1; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + shape[3] = reshape_tensor[0]->attr.dim_num > 3 ? new_shape[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( reshape_tensor[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( reshape_tensor[1]->t, shape, 4 ); } - else if (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH) + else if (new_shape[0] < new_shape[1]) { - shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; - shape[1] = 1; - shape[2] = inputs[0]->attr.size[2]; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); - } - else if (inputs[0]->attr.size[0] < inputs[0]->attr.size[1]) - { - shape[0] = inputs[0]->attr.size[1]; - shape[1] = inputs[0]->attr.size[0]; - shape[2] = inputs[0]->attr.size[2]; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + shape[0] = new_shape[1]; + shape[1] = new_shape[0]; + shape[2] = new_shape[2]; + shape[3] = inputs[0]->attr.dim_num > 3 ? 
new_shape[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( reshape_tensor[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( reshape_tensor[1]->t, shape, 4 ); } else { - shape[0] = inputs[0]->attr.size[0]; + shape[0] = new_shape[0]; + rs_input = vsi_nn_kernel_tensor_reshape( reshape_tensor[0]->t, new_shape, rank ); + rs_output = vsi_nn_kernel_tensor_reshape( reshape_tensor[1]->t, new_shape, rank ); } memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); @@ -1091,58 +847,37 @@ static vsi_nn_kernel_node_t _setup attr.dim_num = 4; tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); - if (inputs[1]->attr.dim_num < 2) - { - shape[0] = inputs[1]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); - } - if (inputs[2]->attr.dim_num < 2) - { - shape[0] = inputs[2]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); - } + shape[0] = 1; + shape[1] = rank > 2 ? new_shape[2] : 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 2 ); + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 2 ); + // Mean Vari { tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); if (tmp_node) { uint32_t index = 0; - if (rs_input) - { - mean_vari_node_params[index++] = rs_input; - vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index], - _INSTANCENORM_MEAN_VARI_PARAM_NUM, NULL, 0, tensors, 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( mean_vari_node_params, - _INSTANCENORM_MEAN_VARI_PARAM_NUM, inputs, 1, tensors, 1 ); - } - index = 2; - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - status = vsi_nn_kernel_node_pass_param( tmp_node, mean_vari_node_params, - _INSTANCENORM_MEAN_VARI_PARAM_NUM ); + sums_node_params[index++] = rs_input; + vsi_nn_kernel_node_pack_io( &sums_node_params[index], + _INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 ); + index = 2; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + + status = vsi_nn_kernel_node_pass_param( tmp_node, sums_node_params, + _INSTANCENORM_SUMS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); { // Set default border mode. 
vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } + + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1155,39 +890,11 @@ static vsi_nn_kernel_node_t _setup if (node) { uint32_t index = 0; - if (rs_input) - { - node_params[index++] = rs_input; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; - } - if (inputs[1]->attr.dim_num < 2) - { - node_params[index++] = rs_beta; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; - } - if (inputs[2]->attr.dim_num < 2) - { - node_params[index++] = rs_gamma; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; - } + node_params[index++] = rs_input; + node_params[index++] = rs_beta; + node_params[index++] = rs_gamma; node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if (rs_output) - { - node_params[index++] = rs_output; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; - } + node_params[index++] = rs_output; node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); @@ -1196,24 +903,13 @@ static vsi_nn_kernel_node_t _setup CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[5] ); vsi_nn_kernel_scalar_release( &node_params[6] ); - { - // Set default border mode. - vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } } /* Pass parameters to node. 
*/ final: + vsi_safe_release_tensor(reshape_tensor[0]); + vsi_safe_release_tensor(reshape_tensor[1]); if (rs_beta) { vsi_nn_kernel_tensor_release( &rs_beta ); @@ -1230,16 +926,13 @@ final: { vsi_nn_kernel_tensor_release( &rs_output ); } - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if ( tensors[i] ) - { - vsi_nn_ReleaseTensor( &tensors[i] ); - } + vsi_safe_release_tensor(tensors[i]); } if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} return node; diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index 34c51f8..be4a299 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -38,16 +38,24 @@ __BEGIN_DECLS + #define HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \ ((AXIS << 28) | (IN1_DTYPE << 20) | (IN0_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) - #define HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) \ - "l2normalizescale_axis"#AXIS +#define KERNEL_SOURCE_1 "l2normalizescale_axis0" +#define KERNEL_SOURCE_2 "l2normalizescale_axis0_2d" +#define KERNEL_SOURCE_3 "l2normalizescale_axis1" -#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + +#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \ { HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D"), \ - HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) }, + SOURCE }, + +#define HASH_L2NORMALIZESCALE_KERNELS( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE), \ + SOURCE }, typedef struct { @@ -58,20 +66,27 @@ typedef struct static const _kernel_map_type _l2normalizescale_kernel_map[] = { - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8, 
KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, F16, F16, F16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, I8, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, F16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, U8, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, F16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, I16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, F16, KERNEL_SOURCE_1 ) }; /* @@ -119,6 +134,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) int32_t outputZP = 0; float outputScale = 1.0f; float r_inputScale = 1.0f; + float e2InScale = 1.0f; + float inOutScale = 1.0f; + int32_t axis2Dflg = 0; + int32_t inputWidth = 0; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -168,7 +187,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) outputScale = 1.0f / output_attr->asymm.scale; } + e2InScale = inputScale * inputScale; r_inputScale = 1.0f / inputScale; + inOutScale = inputScale * outputScale; + inputWidth = (int32_t)(output_shape->data[0]); if (1 == axis) { @@ -190,6 +212,13 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) gpu_param.local_size[1] = 1; gpu_param.global_size[0] = 16; gpu_param.global_size[1] = output_shape->data[1]; + + if (output_shape->data[0] < GPU_TENSOR_MAX_WIDTH + && output_shape->data[1] < GPU_TENSOR_MAX_WIDTH + && (output_shape->size == 2 || (output_shape->size == 3 && output_shape->data[2] == 1))) + { + axis2Dflg = 1; + } } else { @@ -257,8 +286,105 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) 0x00000400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, 
// AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - if (1 == axis) + if (axis2Dflg) + { + float zP2x = 2 * (float)inputZP; + float zpSqr8x = 8 * (float)inputZP * (float)inputZP; + float output_ZP = (float)outputZP; + status = vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth); + status |= vsi_nn_kernel_gpu_add_param( node, "zP2x", &zP2x); + status |= vsi_nn_kernel_gpu_add_param( node, "zpSqr8x", &zpSqr8x); + status |= vsi_nn_kernel_gpu_add_param( node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (1 == axis) { int32_t L2NorS_depth = (int32_t)(output_shape->data[1]); status = vsi_nn_kernel_gpu_add_param( node, "L2NorS_depth", &L2NorS_depth); @@ -277,8 +403,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) } else if (0 == axis) { - int32_t inputWidth, inputWidthCount, inputWidthRemain256; - inputWidth = 
(int32_t)(output_shape->data[0]); + int32_t inputWidthCount, inputWidthRemain256; inputWidthRemain256 = (int32_t)(output_shape->data[0] % 256); inputWidthCount = (int32_t)(output_shape->data[0] / 256); vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth); @@ -298,7 +423,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) } } - { + if (axis2Dflg == 0) + { float IntergerScale = inputScale; float output_ZP = (float)outputZP; gpu_dp_inst_t uniExtact8Bin_2x8 = {{ @@ -473,7 +599,8 @@ static vsi_nn_kernel_node_t _setup return NULL; } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) && + (inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH); status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); if ( VSI_SUCCESS == status) { diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index e6ecaa5..e525f5e 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -34,40 +34,21 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS -/* - * Define kernel meta. - */ - typedef enum -{ - LAYERNORM_KERNEL, - LAYERNORM_2D_KERNEL, - SUMSQR_KERNEL, - SUMSQR_2D_KERNEL, - LAYERNORM_WH_KERNEL, - LAYERNORM_WH_2D_KERNEL, -} _kernel_type_e; +#define SOURCE_AXIS0_0 "layer_normalization_0" +#define SOURCE_AXIS0_1 "layer_normalization_1" +#define SOURCE_AXIS0_2 "layer_normalization_2" +#define SOURCE_AXIS0_3 "layer_normalization_3" +#define SOURCE_AXIS01 "layer_normalization_axis01" -#define KERNEL_SOURCE_1 "layer_normalization" -#define KERNEL_SOURCE_2 "layer_normalization_2d" -#define KERNEL_SOURCE_3 "layer_normalization_u8_f16" -#define KERNEL_SOURCE_4 "layer_normalization_wh_u8" -#define KERNEL_SOURCE_5 "layer_normalization_wh_f16" -#define KERNEL_SOURCE_6 "layer_normalization_i16" -#define KERNEL_SOURCE_7 "layer_normalization_wh_i16" -#define KERNEL_SOURCE_8 "layer_normalization_scale_f32" -#define KERNEL_SOURCE_9 "layer_normalization_scale_f32_2d" -#define KERNEL_SOURCE_10 "layer_normalization_scale_f32_bf16" +#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE) -#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE) - -#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE"_2D") #define HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE) @@ -79,58 +60,43 @@ __BEGIN_DECLS #define HASH_LAYERNORM_KEY(_input0_type, _input2_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | _reshape_flag) -#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \ - HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ +#define 
LAYERNORM_KERNELS_3D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, 0), \ + HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, SCALE_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \ - HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ +#define LAYERNORM_KERNELS_2D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, 1), \ + HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, SCALE_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \ HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, 1), \ HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, -// greater than max size -#define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE) +// layer norm on aix 0 and 1 -#define HASH_SUMSQR_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_axis01_sums_"#SRC0_TYPE"to"#DST_TYPE) -#define HASH_LAYERNORM_WH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE) +#define HASH_LN_AXIS01_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_axis01_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE) -#define HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, U4, OUT_TYPE, 0), \ + HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE_AXIS01 }, -#define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_KERNEL), \ - HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_2D_KERNEL), \ - HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_KERNEL), \ - HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ - HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, +#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_LN_AXIS01_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE_AXIS01 }, typedef struct { @@ -142,50 +108,84 @@ typedef struct static const _kernel_map_type _layernorm_kernel_map[] = { // Register kernel here - TENSOR_LAYERNORM_KERNELS( U8, F16, U8, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, F16, U8, KERNEL_SOURCE_2 ) - 
TENSOR_LAYERNORM_KERNELS( U8, F16, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, F16, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS( U8, F32, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_3 ) + LAYERNORM_KERNELS_3D( U8, F16, U8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( U8, F16, U8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_3D( U8, F16, F16, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( U8, F16, F16, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_3D( I8, F16, I8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( I8, F16, I8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_3D( I8, F16, F16, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( I8, F16, F16, SOURCE_AXIS0_0 ) - TENSOR_LAYERNORM_KERNELS( F16, F16, F16, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( F16, F16, F16, KERNEL_SOURCE_2 ) - TENSOR_LAYERNORM_KERNELS( F16, F16, U8, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( F16, F16, U8, KERNEL_SOURCE_2 ) - TENSOR_LAYERNORM_KERNELS( I16, F16, I16, KERNEL_SOURCE_6 ) - TENSOR_LAYERNORM_KERNELS_2D( I16, F16, I16, KERNEL_SOURCE_6 ) + LAYERNORM_KERNELS_3D( F16, F16, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F16, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F16, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F16, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F16, F16, SOURCE_AXIS0_1 ) - TENSOR_LAYERNORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_10 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_10 ) + LAYERNORM_KERNELS_3D( F16, F32, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F32, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F32, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F32, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F32, F16, SOURCE_AXIS0_1 ) + + LAYERNORM_KERNELS_3D( U8, F32, U8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( U8, F32, U8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_3D( U8, F32, F16, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( U8, F32, F16, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_3D( I8, F32, I8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( I8, F32, I8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_3D( I8, F32, F16, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( I8, F32, F16, SOURCE_AXIS0_2 ) + + LAYERNORM_KERNELS_3D( BF16, F32, BF16, SOURCE_AXIS0_3 ) + LAYERNORM_KERNELS_2D( BF16, F32, 
BF16, SOURCE_AXIS0_3 ) }; -static const _kernel_map_type _sumsqr_kernel_map[] = +static const _kernel_map_type _layernorm_axis01_kernel_map[] = { // Register kernel here - TENSOR_SUMSQR_KERNELS( U8, F32, KERNEL_SOURCE_4 ) - TENSOR_SUMSQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_4 ) - TENSOR_SUMSQR_KERNELS( F16, F32, KERNEL_SOURCE_5 ) - TENSOR_SUMSQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_5 ) - TENSOR_SUMSQR_KERNELS( I16, F32, KERNEL_SOURCE_7 ) - TENSOR_SUMSQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_7 ) + LN_AXIS01_SUMS_KERNELS( I8, F32 ) + LN_AXIS01_SUMS_KERNELS( U8, F32 ) + LN_AXIS01_SUMS_KERNELS( F16, F32 ) + LN_AXIS01_SUMS_KERNELS( I16, F32 ) + + LAYERNORM_AXIS01_KERNELS( U8, F16, U8 ) + LAYERNORM_AXIS01_KERNELS( U8, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, I8 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I16 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I8 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, U8 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, I16 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, F16 ) + + LAYERNORM_AXIS01_KERNELS( U8, F32, U8 ) + LAYERNORM_AXIS01_KERNELS( U8, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, I8 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I16 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I8 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, U8 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, I16 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, F16 ) - TENSOR_LAYERNORM_WH_KERNELS( U8, U8, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( U8, U8, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS( U8, F16, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( U8, F16, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS( F16, F16, KERNEL_SOURCE_5 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( F16, F16, KERNEL_SOURCE_5 ) - TENSOR_LAYERNORM_WH_KERNELS( I16, I16, KERNEL_SOURCE_7 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( I16, I16, KERNEL_SOURCE_7 ) }; /* @@ -202,14 +202,14 @@ static vx_param_description_t _layernorm_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _sumSqr_kernel_param_def[] = +static vx_param_description_t _layernorm_axis01_sums_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _layernorm_wh_kernel_param_def[] = +static vx_param_description_t _layernorm_axis01_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -221,8 +221,8 @@ static vx_param_description_t _layernorm_wh_kernel_param_def[] = }; #define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def ) -#define _SUMSQR_PARAM_NUM _cnt_of_array( _sumSqr_kernel_param_def ) -#define _LAYERNORM_WH_PARAM_NUM _cnt_of_array( _layernorm_wh_kernel_param_def ) +#define _LAYERNORM_SUMS_PARAM_NUM _cnt_of_array( _layernorm_axis01_sums_param_def ) +#define _LAYERNORM_AXIS01_PARAM_NUM _cnt_of_array( _layernorm_axis01_kernel_param_def ) /* * Kernel initializer @@ -245,15 +245,9 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1; - float scaleOut = 1; + float output_scale = 1; float output_zp = 0; - int32_t input_zp = 0; - int32_t iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - int32_t tmpZp2 = 0; - float e2InScale = 0; + float 
inv_multiplier = 0; int32_t height = 0, width = 0, chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -265,61 +259,14 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) input_shape = attr[0]->shape; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - scaleIn = 1; - input_zp = 0; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = (float)attr[2]->asymm.zero_point; - scaleOut = 1.0f / attr[2]->asymm.scale; - } - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[2]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_zp = 0; - } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - scaleOut = 1; - output_zp = 0.0f; - } + output_scale = 1.0f / attr[2]->scale; + output_zp = (float)attr[2]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)((input_shape->size <= 2) ? 1 : input_shape->data[2]); - iter = ((width + 15) / 16) * 16; - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - tmpZp2 = iter * input_zp * input_zp; - e2InScale = scaleIn * scaleIn; + inv_multiplier = 1.0f / (float)width; shaderParam.global_scale[0] = width; shaderParam.global_scale[1] = 1; @@ -332,125 +279,95 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - float dimRatio = 1.0f / (float)width; - float dimRatio_scale = dimRatio * scaleIn; - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniExtractHalf4_dp4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 
0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSumU8_16x1 = {{ - 0x55555555, // TCfg + gpu_dp_inst_t uniDataToFP32_2_4x4 = {{ + 0x01010101, // TCfg 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt + 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt + gpu_dp_inst_t uniDataToFP32_3_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt + 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ 0x33333333, // TCfg 0x11110000, // ASelt 0x03020100, 0x03020100, // ABin 0x00000000, // BSelt 0x00000000, 0x00000000, // 
BBin 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t UniPackFP16even_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + gpu_dp_inst_t uniSumX_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ @@ -491,143 +408,75 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) { case _PACK_SELECT_KEY( U8, F16, F16 ): case _PACK_SELECT_KEY( U8, F32, F16 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( I8, F32, F16 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( U8, F32, U8 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + case _PACK_SELECT_KEY( I8, F32, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "UniPackFP16even_2x8", - &UniPackFP16even_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); 
- status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_2_4x4", + &uniDataToFP32_2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_3_4x4", + &uniDataToFP32_3_4x4); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): + case _PACK_SELECT_KEY( I16, F32, F16 ): case _PACK_SELECT_KEY( F16, F16, F16 ): - case _PACK_SELECT_KEY( F16, F16, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", - &uniFp16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", - &uniExtractHalf4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", - &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, F16, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", - &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - 
&uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", - &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F32, U8 ): case _PACK_SELECT_KEY( F16, F32, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", - &uniFp16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", - &uniExtractHalf4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; + case _PACK_SELECT_KEY( I16, F16, I16 ): case _PACK_SELECT_KEY( I16, F32, I16 ): + case _PACK_SELECT_KEY( F16, F16, I16 ): + case _PACK_SELECT_KEY( F16, F32, I16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( F16, F32, U8 ): + case _PACK_SELECT_KEY( F16, F16, I8 ): + case _PACK_SELECT_KEY( F16, F32, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", - &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, 
"UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -669,7 +518,7 @@ OnError: return status; } -DEF_KERNEL_INITIALIZER(_sumsqr_initializer) +DEF_KERNEL_INITIALIZER(_layernorm_axis01_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -686,14 +535,6 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - int32_t input_zp = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - float tmpZp2 = 0; - float e2InScale = 0; - float rowSumScale = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; @@ -705,37 +546,9 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) input_shape = attr[0]->shape; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - iter = height * 16; - - e2InScale = scaleIn * scaleIn; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - tmpZp2 = input_zp * input_zp * e2InScale; - rowSumScale = height * 16 * tmpZp2; - } shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -758,9 +571,9 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - gpu_dp_inst_t uniSumU8_16x1 = {{ + gpu_dp_inst_t uniSumX_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -769,7 +582,7 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) 0x00002400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -778,43 +591,23 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) 0x00000400, // AccumType, 
ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status = vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if (attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == I16) - { - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status = vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -837,7 +630,7 @@ OnError: return status; } -DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) +DEF_KERNEL_INITIALIZER(_layernorm_axis01_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -854,13 +647,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - float scaleOut = 1.0f; + float output_scale = 1.0f; float output_zp = 0; - int32_t input_zp = 0; - float dimRatio = 0; + float inv_multiplier = 0; vx_uint32 group_num = 0; - vx_int32 height = 0, width = 0, chn = 0, height_chn_org = 0; + vx_int32 height = 0, width = 0, chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -870,49 +661,14 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; 
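/* Dequantization convention assumed by these initializers (a sketch, not the
 * full driver logic): DFP maps a fractional length fl to
 *     scale = (fl > 0) ? 1.0f / (float)(1 << fl) : (float)(1 << -fl), zero_point = 0,
 * while ASYMM supplies scale and zero_point directly; in both cases
 *     real = ((float)q - zero_point) * scale. */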
- } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = (float)attr[2]->asymm.zero_point; - scaleOut = 1.0f / attr[2]->asymm.scale; - } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[2]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_zp = 0; - } + output_scale = 1.0f / attr[2]->scale; + output_zp = (float)attr[2]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - height_chn_org = (int32_t)((input_shape->size > 2 ? input_shape->data[2] : 1) / chn); - dimRatio = (float)(1.0 / (width * height)); + inv_multiplier = (float)(1.0 / (width * height)); group_num = (width + 255) / 256; if (attr[0]->dtype == I16 || attr[0]->dtype == F16) @@ -933,25 +689,37 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ 0x33333333, // TCfg 0x11110000, // ASelt 0x03020100, 0x03020100, // ABin @@ -961,91 +729,26 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // 
TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }, GPU_DP_TYPE_16 }; - uint32_t pack_key = 0; -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | (OUT_TYPE << 8)) - - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - status |= vsi_nn_kernel_gpu_add_param(node, "height_depth", &height_chn_org); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", &uniDataToFP32_1_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - - switch( pack_key ) + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + if (attr[2]->dtype == F16) { - case _PACK_SELECT_KEY( U8, U8 ): - case _PACK_SELECT_KEY( U8, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( F16, F16 ): - case _PACK_SELECT_KEY( F16, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - default: - VSI_ASSERT( FALSE ); - return VSI_FAILURE; + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); } -#undef _PACK_SELECT_KEY + else + { 
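/* Integer outputs are packed with uniExtractInteger_2x8 after a requantization
 * of roughly q = round(x * output_scale) + output_zp, where output_scale is the
 * reciprocal of the output tensor scale set a few lines above. */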
+ status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); } OnError: @@ -1076,7 +779,7 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - int32_t reshape2D + int32_t is_img2d_input ) { vsi_status status = VSI_FAILURE; @@ -1084,19 +787,13 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; - _kernel_type_e kernel_type = LAYERNORM_KERNEL; - - if (reshape2D) - { - kernel_type = LAYERNORM_2D_KERNEL; - } + int32_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, kernel_type ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is_img2d_input ); for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) { @@ -1122,14 +819,12 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static vsi_status _query_kernel_wh +static vsi_status _query_kernel_axis01 ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel_sumSqr, - vsi_nn_kernel_t* kernel, - _kernel_type_e is2D_sumsqr, - _kernel_type_e is2D_wh + vsi_nn_kernel_t* kernel_sums, + vsi_nn_kernel_t* kernel ) { vsi_status status = VSI_FAILURE; @@ -1143,56 +838,56 @@ static vsi_status _query_kernel_wh input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, F32, is2D_sumsqr ); + key = HASH_LAYERNORM_KEY( input0_dtype, U4, F32, 0 ); - for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) + for( i = 0; i < _cnt_of_array(_layernorm_axis01_kernel_map); i ++ ) { - if ( _sumsqr_kernel_map[i].key == key ) + if ( _layernorm_axis01_kernel_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + if ( i < _cnt_of_array(_layernorm_axis01_kernel_map) ) { - snprintf( kernel_sumSqr->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); - kernel_sumSqr->info.parameters = _sumSqr_kernel_param_def; - kernel_sumSqr->info.numParams = _SUMSQR_PARAM_NUM; - kernel_sumSqr->info.initialize = _sumsqr_initializer; + snprintf( kernel_sums->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_axis01_kernel_map[i].function_name ); + kernel_sums->info.parameters = _layernorm_axis01_sums_param_def; + kernel_sums->info.numParams = _LAYERNORM_SUMS_PARAM_NUM; + kernel_sums->info.initialize = _layernorm_axis01_sums_initializer; - vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_sums, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - _sumsqr_kernel_map[i].source_name ); - vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - _sumsqr_kernel_map[i].source_name ); + _layernorm_axis01_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_sums, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _layernorm_axis01_kernel_map[i].source_name ); } - key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is2D_wh ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, 0 ); - for( i = 0; i < 
_cnt_of_array(_sumsqr_kernel_map); i ++ ) + for ( i = 0; i < _cnt_of_array(_layernorm_axis01_kernel_map); i ++ ) { - if ( _sumsqr_kernel_map[i].key == key ) + if ( _layernorm_axis01_kernel_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + if ( i < _cnt_of_array(_layernorm_axis01_kernel_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); - kernel->info.parameters = _layernorm_wh_kernel_param_def; - kernel->info.numParams = _LAYERNORM_WH_PARAM_NUM; - kernel->info.initialize = _layernorm_wh_initializer; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_axis01_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_axis01_kernel_param_def; + kernel->info.numParams = _LAYERNORM_AXIS01_PARAM_NUM; + kernel->info.initialize = _layernorm_axis01_initializer; vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - _sumsqr_kernel_map[i].source_name ); + _layernorm_axis01_kernel_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - _sumsqr_kernel_map[i].source_name ); + _layernorm_axis01_kernel_map[i].source_name ); status = VSI_SUCCESS; } return status; -} /* _query_kernel_wh() */ +} /* _query_kernel_axis01() */ -static vsi_nn_kernel_node_t _setup_wh +static vsi_nn_kernel_node_t _setup_axis01 ( vsi_nn_graph_t * graph, vsi_nn_tensor_t ** inputs, @@ -1205,30 +900,22 @@ static vsi_nn_kernel_node_t _setup_wh { vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; - vsi_nn_kernel_node_param_t sumSqr_node_params[_SUMSQR_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t node_params[_LAYERNORM_WH_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_param_t sums_node_params[_LAYERNORM_SUMS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_AXIS01_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t sums_node = NULL; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_attr_t attr; - _kernel_type_e is2D_sumsqr = SUMSQR_2D_KERNEL; - _kernel_type_e is2D_wh = LAYERNORM_WH_2D_KERNEL; - vsi_nn_kernel_t * kernel_sumSqr = NULL; - vsi_nn_tensor_t * tensor_sumSqr = NULL; - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + vsi_nn_kernel_t * kernel_sums = NULL; + vsi_nn_tensor_t * tensor_sums = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) / + (input_scale * input_scale); int32_t axis[VSI_NN_MAX_DIM_NUM] = {0}; int32_t axis_num = 1; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; uint32_t axis_size = 0; uint32_t rank_in = 0, rank_para = 0; - vsi_size_t outer_size = 1; - uint32_t i = 0; - - for(i = 1; i < inputs[0]->attr.dim_num; i++) - { - outer_size *= inputs[0]->attr.size[i]; - } status = vsi_nn_kernel_optimize_tensor_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -1254,15 +941,9 @@ static vsi_nn_kernel_node_t _setup_wh rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[0], rank_in); - if (rank_in > 2) - { - is2D_sumsqr = SUMSQR_KERNEL; - is2D_wh = LAYERNORM_WH_KERNEL; - } - - kernel_sumSqr = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + kernel_sums = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); // Assign unique_id - kernel_sumSqr->unique_id = kernel->unique_id; + kernel_sums->unique_id = kernel->unique_id; memset( &attr, 0, 
sizeof(vsi_nn_tensor_attr_t) ); attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; @@ -1275,76 +956,59 @@ static vsi_nn_kernel_node_t _setup_wh { attr.size[0] = ((new_shape[0][0] + 127) / 128) * 4; } - attr.size[1] = outer_size; + attr.size[1] = new_shape[0][2]; attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 4; - tensor_sumSqr = vsi_nn_CreateTensor( graph, &attr ); + attr.size[3] = new_shape[0][3]; + attr.dim_num = rank_in; + tensor_sums = vsi_nn_CreateTensor( graph, &attr ); - status = _query_kernel_wh(inputs, outputs, kernel_sumSqr, kernel, is2D_sumsqr, is2D_wh); + status = _query_kernel_axis01(inputs, outputs, kernel_sums, kernel); if ( VSI_SUCCESS != status ) { goto final; } + /* + ** sum(x) and sumsq(x*x) + */ + sums_node = vsi_nn_kernel_create_node(graph, kernel_sums); + if (sums_node) { - tmp_node = vsi_nn_kernel_create_node( graph, kernel_sumSqr ); - if (tmp_node) - { - sumSqr_node_params[0] = rs_input; - sumSqr_node_params[1] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; + sums_node_params[0] = rs_input; + sums_node_params[1] = (vsi_nn_kernel_node_param_t)tensor_sums->t; - status = vsi_nn_kernel_node_pass_param( tmp_node, sumSqr_node_params, - _SUMSQR_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( + sums_node, sums_node_params, _LAYERNORM_SUMS_PARAM_NUM); + CHECK_STATUS(status); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U16 = 0; + status = vxSetNodeAttribute( + (vx_node)sums_node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); - { - // Set default border mode. - vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } } + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) { - node = vsi_nn_kernel_create_node( graph, kernel ); - if (node) - { - uint32_t index = 0; - node_params[index++] = rs_input; - node_params[index++] = rs_beta; - node_params[index++] = rs_gamma; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; - node_params[index++] = rs_output; - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = rs_beta; + node_params[index++] = rs_gamma; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_sums->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &eps); - status = vsi_nn_kernel_node_pass_param( node, node_params, - _LAYERNORM_WH_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &node_params[5] ); - { - // Set default border mode. 
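/* Note on the eps scaling above: eps is divided by input_scale * input_scale on
 * the assumption that the sums kernel accumulates raw quantized values, so the
 * variance it produces is expressed in 1/scale^2 units; adding eps/scale^2 there
 * is arithmetically the same as adding eps after rescaling, since
 *     (var_q + eps / scale^2) * scale^2 == var_real + eps. */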
- vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } - } + status = vsi_nn_kernel_node_pass_param( + node, node_params, _LAYERNORM_AXIS01_PARAM_NUM); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release(&node_params[5]); } + final: if (rs_beta) { @@ -1362,20 +1026,20 @@ final: { vsi_nn_kernel_tensor_release( &rs_output ); } - if ( kernel_sumSqr ) + if ( kernel_sums ) { - vsi_nn_kernel_release( &kernel_sumSqr ); + vsi_nn_kernel_release( &kernel_sums ); } - if ( tensor_sumSqr ) + if ( tensor_sums ) { - vsi_nn_ReleaseTensor( &tensor_sumSqr ); + vsi_nn_ReleaseTensor( &tensor_sums ); } - if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (sums_node) {vsi_nn_kernel_node_release( &sums_node );} return node; } -static vsi_nn_kernel_node_t _setup +static vsi_nn_kernel_node_t _setup_axis0 ( vsi_nn_graph_t * graph, vsi_nn_tensor_t ** inputs, @@ -1389,104 +1053,48 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - vsi_size_t *input_size = inputs[0]->attr.size; - uint32_t dims_num = inputs[0]->attr.dim_num; - int32_t rs_flg = 0; - int32_t optFlg = 0; + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + vsi_nn_tensor_t* rs_tensors[4] = { NULL }; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) / + (input_scale * input_scale); + int32_t axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t axis_num = 1; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + int32_t is_img2d_input = 0; - if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) + status = vsi_nn_kernel_optimize_tensor_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); + if ( status == FALSE) { - node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel); - goto final; + return NULL; } - if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) - && dims_num > 2) - { - rs_flg = 1; - } - optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); + is_img2d_input = rank_in < 3 || (new_shape[0][2] == 1); - status = _query_kernel( inputs, outputs, kernel, optFlg); + status = _query_kernel( inputs, outputs, kernel, is_img2d_input); if (VSI_SUCCESS != status) { goto final; } - if (rs_flg) - { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - shape[0] = inputs[0]->attr.size[0]; - shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; - shape[2] = 1; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - - shape[0] = outputs[0]->attr.size[0]; - shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; - shape[2] = 1; - shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); - } - if (inputs[1]->attr.dim_num < 2) - { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - shape[0] = inputs[1]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); - } - if (inputs[2]->attr.dim_num < 2) - { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - shape[0] = inputs[2]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); - } + new_shape[1][0] = new_shape[0][0]; + new_shape[1][1] = 1; + rs_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], new_shape[0], rank_in); + rs_tensors[1] = vsi_nn_reshape_tensor(graph, inputs[1], new_shape[1], 2); + rs_tensors[2] = vsi_nn_reshape_tensor(graph, inputs[2], new_shape[1], 2); + rs_tensors[3] = vsi_nn_reshape_tensor(graph, outputs[0], new_shape[0], rank_in); // Nomalization node = vsi_nn_kernel_create_node( graph, kernel ); if (node) { - uint32_t index = 0; - if (rs_flg) - { - node_params[index++] = rs_input; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; - } - if (inputs[1]->attr.dim_num < 2) - { - node_params[index++] = rs_beta; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; - } - if (inputs[2]->attr.dim_num < 2) - { - node_params[index++] = rs_gamma; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; - } - if (rs_flg) - { - node_params[index++] = rs_output; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; - } - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + vsi_nn_kernel_node_pack_io(node_params, _LAYERNORM_PARAM_NUM, + rs_tensors, 3, &rs_tensors[3], 1); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); status = vsi_nn_kernel_node_pass_param( node, node_params, _LAYERNORM_PARAM_NUM ); @@ -1496,12 +1104,7 @@ static vsi_nn_kernel_node_t _setup // Set default border mode. vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1509,19 +1112,37 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. 
*/ final: - if (rs_beta) + vsi_safe_release_tensor(rs_tensors[0]); + vsi_safe_release_tensor(rs_tensors[1]); + vsi_safe_release_tensor(rs_tensors[2]); + vsi_safe_release_tensor(rs_tensors[3]); + + return node; +} /* _setup() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_size_t *input_size = inputs[0]->attr.size; + + if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) { - vsi_nn_kernel_tensor_release( &rs_beta ); + node = _setup_axis01(graph, inputs, input_num, outputs, output_num, params, kernel); } - if (rs_gamma) + else { - vsi_nn_kernel_tensor_release( &rs_gamma ); - } - if (rs_flg) - { - vsi_nn_kernel_tensor_release( &rs_input ); - vsi_nn_kernel_tensor_release( &rs_output ); + node = _setup_axis0(graph, inputs, input_num, outputs, output_num, params, kernel); } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index c03e942..5825491 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -910,6 +910,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "input01Scale", &inScaleMul ); + status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 317e8a0..460ad87 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - int32_t fl = (uint8_t)attr[2]->dfp.fl; + int32_t fl = attr[2]->dfp.fl; if (fl > 0) { output_scale = (float) ((int64_t)1 << fl); diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 30dfc93..11478f5 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - int32_t fl = (uint8_t)attr[2]->dfp.fl; + int32_t fl = attr[2]->dfp.fl; if (fl > 0) { output_scale = (float) ((int64_t)1 << fl); diff --git a/src/tim/vx/internal/src/kernel/evis/mod_evis.c b/src/tim/vx/internal/src/kernel/evis/mod_evis.c new file mode 100644 index 0000000..fe7edd7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/mod_evis.c @@ -0,0 +1,444 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice 
and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + + #define MOD_KERNEL_SOURCE_NAME "mod" + +#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \ + MOD_KERNEL_SOURCE_NAME }, + +#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + MOD_KERNEL_SOURCE_NAME }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _mod_kernel_map[] = +{ + // Register kernel here + MOD_KERNELS( F16, F16, F16 ) + MOD_KERNELS( F16, F16, I16 ) + MOD_KERNELS( F16, F16, I8 ) + MOD_KERNELS( F16, F16, U8 ) + MOD_KERNELS( I16, I16, I16 ) + MOD_KERNELS( I8, I8, I8 ) + MOD_KERNELS( U8, U8, U8 ) + MOD_KERNELS( I16, I16, F16 ) + MOD_KERNELS( I8, I8, F16 ) + MOD_KERNELS( U8, U8, F16 ) + MOD_KERNELS( BF16, BF16, BF16 ) + + MOD_KERNELS_2D( F16, F16, F16 ) + MOD_KERNELS_2D( F16, F16, I16 ) + MOD_KERNELS_2D( F16, F16, I8 ) + MOD_KERNELS_2D( F16, F16, U8 ) + MOD_KERNELS_2D( I16, I16, I16 ) + MOD_KERNELS_2D( I8, I8, I8 ) + MOD_KERNELS_2D( U8, U8, U8 ) + MOD_KERNELS_2D( I16, I16, F16 ) + MOD_KERNELS_2D( I8, I8, F16 ) + MOD_KERNELS_2D( U8, U8, F16 ) + MOD_KERNELS_2D( BF16, BF16, BF16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _mod_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_mod_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_status status = VX_FAILURE; + vx_tensor input0 = (vx_tensor)param[0]; + vx_tensor input1 = (vx_tensor)param[1]; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; + vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + 
vsi_nn_kernel_dtype_e input0_dtype = F16; + int32_t input0_fl = 0; + int32_t input1_fl = 0; + int32_t output_fl = 0; + float inScale0 = 1.0f; + float inScale1 = 1.0f; + float outScale = 1.0f; + float in0Tail = 0; + float in1Tail = 0; + float outZp = 0; + + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 ); + CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1 ); + CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + input0_dtype = input0_attr->dtype; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? + (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2] : 1; + + if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + input0_fl = input0_attr->dfp.fl; + if (input0_fl > 0) + { + inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl); + } + else + { + inScale0 = (float)((int64_t)1 << -input0_fl); + } + } + else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + inScale0 = input0_attr->asymm.scale; + in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point); + } + + if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + input1_fl = input1_attr->dfp.fl; + if (input1_fl > 0) + { + inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl); + } + else + { + inScale1 = (float)((int64_t)1 << -input1_fl); + } + } + else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + inScale1 = input1_attr->asymm.scale; + in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point); + } + + if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outScale = (float) ((int64_t)1 << output_fl); + } + else + { + outScale = 1.0f / (float)((int64_t)1 << -output_fl); + } + } + else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + outScale = 1.0f / output_attr->asymm.scale; + outZp = (float)(output_attr->asymm.zero_point); + } + + if (BF16 == input0_dtype) + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t 
uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail ); + status |= vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input0_attr) + { + vsi_nn_kernel_tensor_attr_release(&input0_attr); + } + if (input1_attr) + { + vsi_nn_kernel_tensor_attr_release(&input1_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; +} /* _mod_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _mod_kernel_map; + size_t kernel_map_size = _cnt_of_array( _mod_kernel_map ); + vx_param_description_t * param_def = _mod_kernel_param_def; + size_t param_def_size = _cnt_of_array( 
_mod_kernel_param_def ); + vx_kernel_initialize_f initializer = _mod_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); + + for (i = 0; i < kernel_map_size; i ++) + { + if (kernel_map[i].key == key) + { + break; + } + } + + if (i < kernel_map_size) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod"); + + if (!vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num )) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1); + if (vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == F16 || + vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == F16 || + vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == BF16 || + vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == BF16) + { + isfmod = 1; + } + status = _query_kernel( kernel, inputs, outputs, image_2d); + if (VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( mod, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 0ffd627..b4d4f21 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -38,69 +38,20 @@ __BEGIN_DECLS -#define VX_KERNEL_NAME_POW_F16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16F16toF16") -#define VX_KERNEL_NAME_POW_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toF16_2D") -#define VX_KERNEL_NAME_POW_F16F16TOU8 CVIVANTE_NAMESPACE("evis.pow_F16F16toU8") -#define VX_KERNEL_NAME_POW_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toU8_2D") -#define VX_KERNEL_NAME_POW_F16F16TOI8 CVIVANTE_NAMESPACE("evis.pow_F16F16toI8") -#define VX_KERNEL_NAME_POW_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI8_2D") -#define VX_KERNEL_NAME_POW_F16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16F16toI16") -#define VX_KERNEL_NAME_POW_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI16_2D") -#define VX_KERNEL_NAME_POW_F16U8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16U8toF16") -#define VX_KERNEL_NAME_POW_F16U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toF16_2D") -#define VX_KERNEL_NAME_POW_F16I8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I8toF16") -#define VX_KERNEL_NAME_POW_F16I8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toF16_2D") -#define VX_KERNEL_NAME_POW_F16I16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I16toF16") -#define VX_KERNEL_NAME_POW_F16I16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toF16_2D") -#define VX_KERNEL_NAME_POW_F16U8TOU8 CVIVANTE_NAMESPACE("evis.pow_F16U8toU8") -#define VX_KERNEL_NAME_POW_F16U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toU8_2D") -#define VX_KERNEL_NAME_POW_F16I8TOI8 CVIVANTE_NAMESPACE("evis.pow_F16I8toI8") -#define VX_KERNEL_NAME_POW_F16I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toI8_2D") -#define VX_KERNEL_NAME_POW_F16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16I16toI16") -#define VX_KERNEL_NAME_POW_F16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toI16_2D") -#define VX_KERNEL_NAME_POW_U8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_U8F16toF16") -#define VX_KERNEL_NAME_POW_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toF16_2D") -#define VX_KERNEL_NAME_POW_I8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I8F16toF16") -#define VX_KERNEL_NAME_POW_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toF16_2D") -#define VX_KERNEL_NAME_POW_I16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I16F16toF16") -#define VX_KERNEL_NAME_POW_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toF16_2D") -#define VX_KERNEL_NAME_POW_U8F16TOU8 CVIVANTE_NAMESPACE("evis.pow_U8F16toU8") -#define VX_KERNEL_NAME_POW_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toU8_2D") -#define VX_KERNEL_NAME_POW_I8F16TOI8 CVIVANTE_NAMESPACE("evis.pow_I8F16toI8") -#define VX_KERNEL_NAME_POW_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toI8_2D") -#define VX_KERNEL_NAME_POW_I16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16F16toI16") -#define VX_KERNEL_NAME_POW_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toI16_2D") -#define VX_KERNEL_NAME_POW_U8U8TOU8 CVIVANTE_NAMESPACE("evis.pow_U8U8toU8") -#define VX_KERNEL_NAME_POW_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toU8_2D") -#define VX_KERNEL_NAME_POW_I8I8TOI8 CVIVANTE_NAMESPACE("evis.pow_I8I8toI8") -#define VX_KERNEL_NAME_POW_I8I8TOI8_2D 
CVIVANTE_NAMESPACE("evis.pow_I8I8toI8_2D") -#define VX_KERNEL_NAME_POW_I16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16I16toI16") -#define VX_KERNEL_NAME_POW_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16I16toI16_2D") -#define VX_KERNEL_NAME_POW_BF16BF16TOBF16 CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16") -#define VX_KERNEL_NAME_POW_BF16BF16TOBF16_2D CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16_2D") -#define VX_KERNEL_NAME_POW_U8U8TOF16 CVIVANTE_NAMESPACE("evis.pow_U8U8toF16") -#define VX_KERNEL_NAME_POW_U8U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toF16_2D") - -#define KERNEL_SOURCE_1 "pow_fp16", -#define KERNEL_SOURCE_2 "pow_fp16_i8", -#define KERNEL_SOURCE_3 "pow_fp16_i16", -#define KERNEL_SOURCE_4 "pow_u8", -#define KERNEL_SOURCE_5 "pow_i8", -#define KERNEL_SOURCE_6 "pow_i16" - +#define KERNEL_SOURCE "pow", #define HASH_POW_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) -#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ - VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \ - SOURCE }, + CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE), \ + KERNEL_SOURCE }, -#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ - VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \ - SOURCE }, + CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + KERNEL_SOURCE }, static const struct { uint32_t key; @@ -108,59 +59,59 @@ static const struct { const char* source_name; } pow_map[] = { - TENSOR_POW_KERNELS(F16, F16, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS(F16, F16, U8, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS(F16, U8, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS(F16, U8, U8, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS(F16, F16, F16) + TENSOR_POW_KERNELS(F16, F16, U8) + TENSOR_POW_KERNELS(F16, U8, F16) + TENSOR_POW_KERNELS(F16, U8, U8) - TENSOR_POW_KERNELS(F16, F16, I8, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS(F16, I8, F16, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS(F16, I8, I8, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS(F16, F16, I8) + TENSOR_POW_KERNELS(F16, I8, F16) + TENSOR_POW_KERNELS(F16, I8, I8) - TENSOR_POW_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS(F16, I16, F16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS(F16, I16, I16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS(F16, F16, I16) + TENSOR_POW_KERNELS(F16, I16, F16) + TENSOR_POW_KERNELS(F16, I16, I16) - TENSOR_POW_KERNELS(U8, F16, F16, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS(U8, F16, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS(U8, U8, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS(U8, U8, F16, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS(U8, F16, F16) + TENSOR_POW_KERNELS(U8, F16, U8) + TENSOR_POW_KERNELS(U8, U8, U8) + TENSOR_POW_KERNELS(U8, U8, F16) - TENSOR_POW_KERNELS(I8, F16, F16, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS(I8, F16, I8, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS(I8, I8, I8, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS(I8, F16, F16) + TENSOR_POW_KERNELS(I8, F16, I8) + TENSOR_POW_KERNELS(I8, I8, I8) - TENSOR_POW_KERNELS(I16, F16, F16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS(I16, F16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS(I16, I16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS(BF16, BF16, BF16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS(I16, F16, F16) + TENSOR_POW_KERNELS(I16, F16, I16) + 
TENSOR_POW_KERNELS(I16, I16, I16) + TENSOR_POW_KERNELS(BF16, BF16, BF16) - TENSOR_POW_KERNELS_2D(F16, F16, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS_2D(F16, U8, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS_2D(F16, U8, U8, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_2D(F16, F16, F16) + TENSOR_POW_KERNELS_2D(F16, F16, U8) + TENSOR_POW_KERNELS_2D(F16, U8, F16) + TENSOR_POW_KERNELS_2D(F16, U8, U8) - TENSOR_POW_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS_2D(F16, I8, F16, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS_2D(F16, I8, I8, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS_2D(F16, F16, I8) + TENSOR_POW_KERNELS_2D(F16, I8, F16) + TENSOR_POW_KERNELS_2D(F16, I8, I8) - TENSOR_POW_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS_2D(F16, I16, F16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS_2D(F16, I16, I16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS_2D(F16, F16, I16) + TENSOR_POW_KERNELS_2D(F16, I16, F16) + TENSOR_POW_KERNELS_2D(F16, I16, I16) - TENSOR_POW_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS_2D(U8, F16, F16) + TENSOR_POW_KERNELS_2D(U8, F16, U8) + TENSOR_POW_KERNELS_2D(U8, U8, U8) + TENSOR_POW_KERNELS_2D(U8, U8, F16) - TENSOR_POW_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS_2D(I8, F16, F16) + TENSOR_POW_KERNELS_2D(I8, F16, I8) + TENSOR_POW_KERNELS_2D(I8, I8, I8) - TENSOR_POW_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS_2D(BF16, BF16, BF16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS_2D(I16, F16, F16) + TENSOR_POW_KERNELS_2D(I16, F16, I16) + TENSOR_POW_KERNELS_2D(I16, I16, I16) + TENSOR_POW_KERNELS_2D(BF16, BF16, BF16) }; static vx_param_description_t vxPowKernel_param_def[] = @@ -186,24 +137,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int8_t in0_fl = 0; - int32_t src0ZP = 0; - float src0Scale = 1.0f; - int8_t in1_fl = 0; - int32_t src1ZP = 0; - float src1Scale = 1.0f; - int8_t out_fl = 0; - float dstZP = 0; - float dstScale = 1.0f; + float input0_scale = 1.0f; + float input1_scale = 1.0f; + float input0_tail = 0; + float input1_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; - int32_t postshift0 = 0; - int32_t postshift1 = 0; - float outScale_fl = 1; - - uint16_t M0 = 0; - uint16_t M1 = 0; - - vsi_size_t zAx = 1; uint32_t pack_key = 0; // dim number ??? 
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; @@ -220,58 +160,59 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - in0_fl = (int8_t)attr[0]->dfp.fl; - postshift0 = in0_fl - 0; + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + input0_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input0_scale = (float)((int64_t)1 << -fl); + } } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - - gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postshift0); + input0_scale = attr[0]->asymm.scale; + input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale; } if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - in1_fl = (int8_t)attr[1]->dfp.fl; - postshift1 = in1_fl - 0; + int32_t fl = attr[1]->dfp.fl; + if (fl > 0) + { + input1_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input1_scale = (float)((int64_t)1 << -fl); + } } else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) { - src1ZP = attr[1]->asymm.zero_point; - src1Scale = attr[1]->asymm.scale; - - gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postshift1); + input1_scale = attr[1]->asymm.scale; + input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale; } if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - out_fl = (int8_t)attr[2]->dfp.fl; - if (out_fl > 0) + int32_t fl = attr[2]->dfp.fl; + if (fl > 0) { - outScale_fl = (vx_float32)((int64_t)1 << out_fl); + output_scale = (float) ((int64_t)1 << fl); } else { - outScale_fl = (1.0f / (vx_float32)((int64_t)1 << -out_fl)); + output_scale = 1.0f / (float)((int64_t)1 << -fl); } } else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) { - dstZP = (float)attr[2]->asymm.zero_point; - dstScale = 1.0f / attr[2]->asymm.scale; - } - - if ( out_shape->size < 3 ) - { - zAx = 1; - } - else - { - zAx = out_shape->data[2]; + output_zp = (float)attr[2]->asymm.zero_point; + output_scale = 1.0f / attr[2]->asymm.scale; } #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ @@ -287,269 +228,122 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) / shaderParam.global_scale[0], 4); shaderParam.global_size[1] = gpu_align_p2((out_shape->data[1] + shaderParam.global_scale[1] - 1) / shaderParam.global_scale[1], 2); - shaderParam.global_size[2] = gpu_align_p2((zAx + shaderParam.global_scale[2] - 1) - / shaderParam.global_scale[2], 1); + shaderParam.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); + switch( pack_key ) { - gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertFstDataToFp32_4x4_2 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecDataToFp32_4x4_2 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4_2 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4_2 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - 
gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // ASelt - 0x01050004, 0x03070206, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // ASelt - 0x05050404, 0x07070606, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniExtractOddData_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x07050301, 0x07050301, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - uint32_t multiplierA = (M0 << 16) | M0; - uint32_t multiplierB = (M1 << 16) | M1; - int32_t i = 8; - - uniConvertUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F); - uniConvertSecUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F); - uniConvertUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F); - uniConvertSecUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F); - for ( i = 8; i < 16; i += 2 ) + case _PACK_SELECT_KEY( BF16, BF16, BF16 ): { - uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA; - uniConvertSecUint8SubZpToFp32_4x4.data[i] = multiplierA; - uniConvertUint8SubZpToFp32_4x4_2.data[i] = multiplierB; - uniConvertSecUint8SubZpToFp32_4x4_2.data[i] = multiplierB; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 
0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", + &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); } - - if ( attr[0]->dtype == I8 || attr[0]->dtype == I16 ) + break; + default: { - gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4, postshift0 ); - gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4, postshift0 ); - } + gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtactHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; - if ( attr[1]->dtype == I8 || attr[1]->dtype == I16 ) - { - gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4_2, postshift1 ); - gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4_2, postshift1 ); + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", + &uniConvertFstDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", + &uniConvertSecDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp); + if (attr[2]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", + &uniExtactHalf8_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", + &uniExtact8Bit_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); } - - switch( pack_key ) - { - case _PACK_SELECT_KEY( F16, F16, I8 ): - case _PACK_SELECT_KEY( F16, I8, F16 ): - case _PACK_SELECT_KEY( F16, I8, I8 ): - case _PACK_SELECT_KEY( F16, F16, I16 ): - 
case _PACK_SELECT_KEY( F16, I16, F16 ): - case _PACK_SELECT_KEY( F16, I16, I16 ): - case _PACK_SELECT_KEY( I8, F16, F16 ): - case _PACK_SELECT_KEY( I8, F16, I8 ): - case _PACK_SELECT_KEY( I8, I8, I8 ): - case _PACK_SELECT_KEY( I16, F16, F16 ): - case _PACK_SELECT_KEY( I16, F16, I16 ): - case _PACK_SELECT_KEY( I16, I16, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", - &uniConvertFstDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", - &uniConvertSecDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2", - &uniConvertFstDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2", - &uniConvertSecDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "outScale_fl", &outScale_fl); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F16, F16 ): - case _PACK_SELECT_KEY( U8, F16, U8 ): - case _PACK_SELECT_KEY( U8, U8, U8 ): - case _PACK_SELECT_KEY( U8, U8, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4", - &uniConvertUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4", - &uniConvertSecUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2", - &uniConvertFstDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2", - &uniConvertSecDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2", - &uniConvertUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2", - &uniConvertSecUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", - &uniConvertHalftoFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP0", &src0ZP); - status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP); - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( F16, F16, F16 ): - case _PACK_SELECT_KEY( F16, F16, U8 ): - case _PACK_SELECT_KEY( F16, U8, F16 ): - case _PACK_SELECT_KEY( F16, U8, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", - &uniConvertFstDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", - &uniConvertSecDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2", - &uniConvertUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2", - &uniConvertSecUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP); - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( BF16, BF16, BF16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", - &uniConvBF16toF32_Part0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", - &uniConvBF16toF32_Part1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, 
"uniExtractOddData_2x8", - &uniExtractOddData_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - default: - break; - } -#undef _PACK_SELECT_KEY - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + break; } +#undef _PACK_SELECT_KEY OnError: if ( attr[0] ) @@ -646,7 +440,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_POW_PARAM_NUM, inputs, 2, outputs, 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_POW_PARAM_NUM ); - } } return node; @@ -655,4 +448,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( pow, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index c543f96..498ee45 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -126,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -152,7 +150,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) } else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - outputScale = 1.0f/outputScale; + outputScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index 2201205..797c925 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -128,8 +128,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); out_shape = attr[0]->shape; - dstZP = (float)attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -147,7 +145,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) } else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - outputScale = 1.0f/outputScale; + outputScale = 1.0f / attr[0]->asymm.scale; + dstZP = (float)attr[0]->asymm.zero_point; } else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index 23ae619..e92b248 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -148,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -161,7 +159,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f / dstScale; + dstScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; } else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c 
b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index 4089e0c..ddfc9b5 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -35,13 +35,15 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS -#define KERNEL_SOURCE_0 "pre_process_rgb888_planar_0", -#define KERNEL_SOURCE_1 "pre_process_rgb888_planar_1", -#define KERNEL_SOURCE_2 "pre_process_rgb888_planar_2", +#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_0", +#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_1", +#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_2", +#define RGB888_SOURCE_0 "pre_process_rgb888_planar_0", +#define RGB888_SOURCE_1 "pre_process_rgb888_planar_1", +#define RGB888_SOURCE_2 "pre_process_rgb888_planar_2", #define STR(a) #a @@ -53,28 +55,48 @@ typedef enum HALF } _internal_scale_e; // Add kernel hashtable here -#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE_FLAG ) \ - (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | (SCALE_FLAG)) +#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG)) #define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ - { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_0 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_0 } + +#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_0 } #define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ - { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, COPY ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_1 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_1 } + +#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_1 } #define PACK_KERNEL_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \ - { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, FOUR_OVER_THREE ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_2 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, FOUR_OVER_THREE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_2 } + +#define PACK_KERNEL_SEP_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, FOUR_OVER_THREE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_2 } #define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ - { 
PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, HALF ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_2 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_2 } + +#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_2 } typedef struct { @@ -98,6 +120,19 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] = PACK_KERNEL_4_OVER_3_MAP( U8, U8 ), PACK_KERNEL_HALF_MAP( U8, U8 ), + + PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ), + + PACK_KERNEL_SEP_COPY_MAP( U8, F16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I8 ), + PACK_KERNEL_SEP_COPY_MAP( U8, U8 ), + + PACK_KERNEL_SEP_4_OVER_3_MAP( U8, U8 ), + PACK_KERNEL_SEP_HALF_MAP( U8, U8 ), }; @@ -105,6 +140,23 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] = * Kernel params */ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ) + +static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -121,7 +173,7 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; -#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ) +#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ) /* * Kernel initializer @@ -149,9 +201,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = 
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; @@ -310,9 +369,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; @@ -406,7 +472,14 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); out_shape = attr[1]->shape; @@ -540,6 +613,7 @@ static vsi_status _query_kernel vsi_bool is_4_over_3 = FALSE; vsi_bool is_half_scale = FALSE; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL); is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) && (height * 3 == (int32_t)outputs[0]->attr.size[1] * 4); @@ -568,7 +642,7 @@ static vsi_status _query_kernel } } - key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, scale_type); + key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type); for ( i = 0; i < _cnt_of_array(pre_process_rgb888_planar_kernel_map); i ++ ) { @@ -581,8 +655,17 @@ static vsi_status _query_kernel { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_rgb888_planar_kernel_map[i].function_name ); - kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ); + + if (is_rgb888_sep) + { + kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ); + } + else + { + kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ); + } if (enable_copy) { @@ -620,8 +703,9 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM]; + vsi_nn_kernel_node_param_t* node_params = NULL; vsi_nn_kernel_node_t node = NULL; + int32_t param_count = 
_PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); @@ -630,7 +714,10 @@ static vsi_nn_kernel_node_t _setup float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); vsi_bool is_no_range_change = FALSE; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + input_num = inputs[1] == NULL ? 1 : input_num; + param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count; + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -648,17 +735,19 @@ static vsi_nn_kernel_node_t _setup status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height ); if ( VSI_SUCCESS == status) { + node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { - uint32_t index = 6; + uint32_t index = inputs[1] == NULL ? 4 : 6; + uint32_t scalar_index = index; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM, + vsi_nn_kernel_node_pack_io( node_params, param_count, inputs, input_num, outputs, output_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); @@ -670,17 +759,21 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - vsi_nn_kernel_scalar_release( &node_params[10] ); - vsi_nn_kernel_scalar_release( &node_params[11] ); - vsi_nn_kernel_scalar_release( &node_params[12] ); - vsi_nn_kernel_scalar_release( &node_params[13] ); + status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); + index = scalar_index; + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); } } + + vsi_nn_safe_free(node_params); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 4181414..5fda281 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -150,8 +150,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - outputZP = (float)attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -176,7 +174,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - outputScale = 1.0f / outputScale; + outputScale = 1.0f / attr[0]->asymm.scale; + outputZP = (float)attr[0]->asymm.zero_point; } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index 7a5c50c..a51eab1 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -135,8 +135,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -151,9 +149,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) width = width / 3; } - if (attr[0]->dtype == U8) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f / dstScale; + dstScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; } shaderParam.global_scale[0] = 16; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index d96e81d..7c7efc7 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ 
-130,8 +130,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -141,9 +139,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) order1 = 0; } - if (attr[0]->dtype == U8) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f / dstScale; + dstScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; } shaderParam.global_scale[0] = 16; diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 394461f..6896307 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -51,6 +51,7 @@ typedef enum UP_3X_HALF, UP_4X_HALF, UP_8X_HALF, + UP_8X_ALIGN, } _internal_scale_e; #define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type @@ -102,6 +103,12 @@ typedef enum "_SAME_3x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } +#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_8x_upsample_align_corners"), \ + "resize_bilinear_align_corners" } + typedef struct { uint32_t key; @@ -128,6 +135,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), PACK_KERNEL_MAP_UP_8X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_8X_ALIGN(U8, U8), }; @@ -228,11 +236,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) uint32_t out_height; float half_pixel_value = 0.0f; vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size); - vsi_bool is_half_pixel_centers = FALSE; - vsi_bool is_2x_up_kernel = FALSE; - vsi_bool is_3x_up_kernel = FALSE; - vsi_bool is_4x_up_kernel = FALSE; - vsi_bool is_8x_up_kernel = FALSE; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -257,20 +260,20 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) if (align_corners && out_width > 1) { - scale_factor[0] = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1); } else { - scale_factor[0] = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width; } if (align_corners && out_height > 1) { - scale_factor[1] = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1); } else { - scale_factor[1] = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height; } if (half_pixel_centers) @@ -282,16 +285,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) half_pixel_value = 0.0f; } - is_half_pixel_centers = (!align_corners) && 
(half_pixel_centers); - - if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers) - { - is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); - is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); - is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); - is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height); - } - if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) { input_scale = input_attr->asymm.scale; @@ -302,11 +295,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) srcFixPointPos = input_attr->dfp.fl; if (srcFixPointPos >= 0) { - input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); } else if (srcFixPointPos < 0) { - input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos); + input_scale = (float)((int64_t)1 << -srcFixPointPos); } inputZP = 0; } @@ -326,11 +319,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) dstFixPointPos = output_attr->dfp.fl; if (dstFixPointPos >= 0) { - output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos); + output_scale = (float) ((int64_t)1 << dstFixPointPos); } else if (dstFixPointPos < 0) { - output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); + output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); } outputZP = 0; } @@ -340,226 +333,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) outputZP = 0; } - if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) - { - gpu_param.global_scale[0] = 16; - gpu_param.global_scale[1] = 1; - } - else if (is_3x_up_kernel) - { - gpu_param.global_scale[0] = 15; - gpu_param.global_scale[1] = 6; - gpu_param.global_scale[2] = 1; - } - else - { - gpu_param.global_scale[0] = 4; - gpu_param.global_scale[1] = 1; - gpu_param.global_scale[2] = 1; - } + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; - if (is_2x_up_kernel) - { - gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_3x_up_kernel) - { - gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ - 0x15515515, // TCfg - 0x00000000, // ASelt - 0x21210110, 0x03323202, // ABin - 0x2aa2aa2a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, - 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ - 0x05155155, // TCfg - 0x00000000, // ASelt - 0x54044343, 
0x00650554, // ABin - 0x0a2aa2aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ - 0x55551155, // TCfg - 0x50501050, // ASelt - 0x01011010, 0x21212121, // ABin - 0xaaaa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ - 0x11555511, // TCfg - 0x10505010, // ASelt - 0x32320202, 0x03033232, // ABin - 0x22aaaa22, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ - 0x55115555, // TCfg - 0x50105050, // ASelt - 0x43434343, 0x54540404, // ABin - 0xaa22aaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ - 0x00551155, // TCfg - 0x00501050, // ASelt - 0x05055454, 0x00006565, // ABin - 0x00aa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_4x_up_kernel) - { - gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 
0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_8x_up_kernel) - { - gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, 
"uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { float dfpScale = input_scale * output_scale; gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ @@ -840,7 +618,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) { float uint8Scale = 1.0f / output_scale; - float uint8ZP_out = (vx_float32)outputZP; + float uint8ZP_out = (float)outputZP; gpu_dp_inst_t uniExtact8Bit_2x8 = {{ 0x33333333, // TCfg 0x11110000, // ASelt @@ -1045,11 +823,299 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) goto final; } - if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel&& !is_8x_up_kernel) + status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _resize_bilinear_initializer() */ + +DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + uint32_t depth = 0; + uint32_t in_width = 0; + uint32_t in_height = 0; + uint32_t out_width = 0; + uint32_t out_height = 0; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; + vsi_bool is_8x_up_kernel = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = 
(uint32_t)(in_shape->data[1]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) { - status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height); + } + + if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + } + else if (is_3x_up_kernel) + { + gpu_param.global_scale[0] = 15; + gpu_param.global_scale[1] = 6; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); CHECK_STATUS_FAIL_GOTO(status, final ); } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ + 0x15515515, // TCfg + 0x00000000, // ASelt + 0x21210110, 0x03323202, // ABin + 0x2aa2aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ + 0x05155155, // TCfg + 0x00000000, // ASelt + 0x54044343, 0x00650554, // ABin + 0x0a2aa2aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ + 0x55551155, // TCfg + 0x50501050, // ASelt + 0x01011010, 0x21212121, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ + 0x11555511, // TCfg + 0x10505010, // ASelt + 0x32320202, 0x03033232, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant + }, 
GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ + 0x55115555, // TCfg + 0x50105050, // ASelt + 0x43434343, 0x54540404, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ + 0x00551155, // TCfg + 0x00501050, // ASelt + 0x05055454, 0x00006565, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_8x_up_kernel) + { + gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 
0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } if (is_2x_up_kernel || is_4x_up_kernel || 
is_8x_up_kernel) { @@ -1071,7 +1137,168 @@ final: if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); return status; -} /* _resize_bilinear_initializer() */ +} /* _bilinear_half_pixel_centers_opt_initializer() */ + +DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + uint32_t depth = 0; + float scale_factor[2] = {0}; + uint32_t in_width = 0; + uint32_t in_height = 0; + uint32_t out_width = 0; + uint32_t out_height = 0; + vsi_bool is_8x_align_corners = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = (uint32_t)(in_shape->data[1]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + if (out_width > 1) + { + scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1); + } + else + { + scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width; + } + + if (out_height > 1) + { + scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1); + } + else + { + scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height; + } + + if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) + { + is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f); + } + + if (is_8x_align_corners) + { + gpu_param.global_scale[0] = 2; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + if (is_8x_align_corners) + { + gpu_dp_inst_t uniBilinear_8x_l10_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00000838, 0x01070731, 0x02060e2a, 0x03051523, + 0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l11_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00000838, 0x01070731, 0x02060e2a, 0x03051523, + 0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l20_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e, + 0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l21_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 
0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e, + 0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l30_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19, + 0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l31_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19, + 0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l40_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00002020, 0x041c041c, 0x08180818, 0x0c140c14, + 0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l41_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00002020, 0x041c041c, 0x08180818, 0x0c140c14, + 0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l10_4x8", &uniBilinear_8x_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l11_4x8", &uniBilinear_8x_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l20_4x8", &uniBilinear_8x_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l21_4x8", &uniBilinear_8x_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l30_4x8", &uniBilinear_8x_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l31_4x8", &uniBilinear_8x_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l40_4x8", &uniBilinear_8x_l40_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l41_4x8", &uniBilinear_8x_l41_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + gpu_param.global_size[0] = gpu_align_p2((in_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (in_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _bilinear_align_corners_opt_initializer() */ /* * Query kernel @@ -1098,19 +1325,46 @@ static vsi_status _query_kernel vx_kernel_initialize_f initializer = _resize_bilinear_initializer; uint32_t key; uint32_t i; - vsi_bool is_2x_upsample =(2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); - vsi_bool is_3x_upsample =(3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (3 * inputs[0]->attr.size[1] == 
outputs[0]->attr.size[1]); - vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); - vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + float width_scale = 0; + float height_scale = 0; + vsi_size_t input_width = inputs[0]->attr.size[0]; + vsi_size_t input_height = inputs[0]->attr.size[1]; + vsi_size_t output_width = outputs[0]->attr.size[0]; + vsi_size_t output_height = outputs[0]->attr.size[1]; + vsi_bool is_2x_upsample =(2 * input_width == output_width) \ + && (2 * input_height == output_height); + vsi_bool is_3x_upsample =(3 * input_width == output_width) \ + && (3 * input_height == output_height); + vsi_bool is_4x_upsample =(4 * input_width == output_width) \ + && (4 * input_height == output_height); + vsi_bool is_8x_upsample =(8 * input_width == output_width) \ + && (8 * input_height == output_height); + vsi_bool is_8x_align_corners = FALSE; _internal_scale_e scale_flag = UP; + if (align_corners && outputs[0]->attr.size[0] > 1) + { + width_scale = ((float)(input_width - 1) * 1.0f) / (float)(output_width - 1); + } + else + { + width_scale = ((float)input_width * 1.0f) / (float)output_width; + } + + if (align_corners && output_height > 1) + { + height_scale = ((float)(input_height - 1) * 1.0f) / (float)(output_height - 1); + } + else + { + height_scale = ((float)input_height * 1.0f) / (float)output_height; + } + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + is_8x_align_corners = (vsi_bool)( width_scale == 0.125f && height_scale == 0.125f && in_dtype == U8 ); + is_2x_upsample &= (in_dtype == U8); is_3x_upsample &= (in_dtype == U8); is_4x_upsample &= (in_dtype == U8); @@ -1121,18 +1375,27 @@ static vsi_status _query_kernel if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample) { scale_flag = UP_2X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; } else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample) { scale_flag = UP_3X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; } else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample) { scale_flag = UP_4X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; } else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample) { scale_flag = UP_8X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; + } + else if (is_same_type && (align_corners) && (!half_pixel_centers) && is_8x_align_corners) + { + scale_flag = UP_8X_ALIGN; + initializer = _bilinear_align_corners_opt_initializer; } else if (is_same_type && is_evis2) { @@ -1240,20 +1503,20 @@ static vsi_nn_tensor_t* _create_scale_tensor if (align_corners && width > 1) { - width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1); + width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1); } else { - width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)width; + width_scale = ((float)input_width * 1.0f) / (float)width; } if (align_corners && height > 1) { - height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(height - 1); + height_scale = ((float)(input_height - 1) * 1.0f) / (float)(height - 1); } else { - height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)height; + height_scale 
= ((float)input_height * 1.0f) / (float)height; } @@ -1273,7 +1536,7 @@ static vsi_nn_tensor_t* _create_scale_tensor int32_t h0 = 0; if (half_pixel_centers) { - input_h = ((vx_float32)y + 0.5f) * height_scale - 0.5f; + input_h = ((float)y + 0.5f) * height_scale - 0.5f; } else { @@ -1291,7 +1554,7 @@ static vsi_nn_tensor_t* _create_scale_tensor float br = 0.0f; if (half_pixel_centers) { - input_w = ((vx_float32)x + 0.5f) * width_scale - 0.5f; + input_w = ((float)x + 0.5f) * width_scale - 0.5f; } else { diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c index 6e2e6bd..b8e634e 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c @@ -51,6 +51,15 @@ __BEGIN_DECLS "_"STR(UP_SCALE)"x_upsample_half_pixel_centers"), \ "resize_bilinear_nhwc" } +#define BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (UP_SCALE << 16)) + +#define BILINEAR_NHWC_BOUND_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \ + { BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_nhwc_bound_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_"STR(UP_SCALE)"x"), \ + "resize_bilinear_nhwc_bound" } + typedef struct { uint32_t key; @@ -65,6 +74,12 @@ static const _kernel_map_type _resize_bilinear_nhwc_kernel_map[] = BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 4), }; +static const _kernel_map_type _bilinear_nhwc_bound_kernel_map[] = +{ + BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 2), + BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 3), + BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 4), +}; /* * Kernel params @@ -81,6 +96,14 @@ static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] = #define SCALAR_ALIGN_CORNERS (2) #define SCALAR_HALF_PIXEL (3) +static vx_param_description_t _bilinear_nhwc_bound_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BILINEAR_NHWC_BOUND_PARAM_NUM _cnt_of_array( _bilinear_nhwc_bound_kernel_param_def ) + /* * Kernel initializer */ @@ -382,50 +405,193 @@ final: return status; } /* _resize_bilinear_initializer() */ +DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + uint32_t x_coord[2] = {0}; + uint32_t in_width; + uint32_t in_height; + uint32_t out_width; + uint32_t out_height; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; + + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + out_shape = output_attr->shape; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = 
(uint32_t)(in_shape->data[1]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + + + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize_x2_nhwc2_0_4x8 = {{ + 0x55555511, 0x55555555, // TCfg + 0x46104000, 0x3a48829c, 0x4882acca, 0xc4acca3a, 0xbd4e5b50, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x000c0004, 0x09030301, 0x03090103, 0x03090103, + 0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + + gpu_param.global_scale[0] = 2; + gpu_param.global_scale[1] = 1; + x_coord[1] = (uint32_t)(out_shape->data[0]) - 2; + x_coord[0] = (x_coord[1] * 2 - 1) >> 2; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_0_4x8", &uniResize_x2_nhwc2_0_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize_x3_nhwc2_l10_4x4 = {{ + 0x05055511, // TCfg + 0x04045010, // ASelt + 0x31310000, 0x00330022, // ABin + 0x0a0aaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_param.global_scale[0] = 3; + gpu_param.global_scale[1] = 1; + x_coord[1] = (uint32_t)(out_shape->data[0]) - 2; + x_coord[0] = (x_coord[1] - 1) / 6 * 2; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l10_4x4", &uniResize_x3_nhwc2_l10_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize_x4_nhwc2_l00_4x8 = {{ + 0x55555511, 0x55555555, // TCfg + 0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x00280018, 0x190f0f09, 0x23051503, 0x23051503, + 0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l10_4x8 = {{ + 0x55555511, 0x55555555, // TCfg + 0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x00380008, 0x23150503, 0x31070701, 0x31070701, + 0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + x_coord[1] = (uint32_t)(out_shape->data[0]) - 2; + x_coord[0] = ((x_coord[1] - 3) >> 3) * 2; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l00_4x8", &uniResize_x4_nhwc2_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l10_4x8", &uniResize_x4_nhwc2_l10_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + gpu_param.global_size[0] = gpu_align_p2((out_height + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; + gpu_param.dim = 2; + + status |= vsi_nn_kernel_gpu_add_param( node, "x_coord", &x_coord); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} 
/* _bilinear_nhwc_bound_initializer() */ + /* * Query kernel */ static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs, - int32_t align_corners, - int32_t half_pixel_centers, - uint32_t up_scale + const uint32_t hashkey, + uint32_t kernel_id ) { + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def; vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in_dtype; - vsi_nn_kernel_dtype_e out_dtype; - const _kernel_map_type * kernel_map = _resize_bilinear_nhwc_kernel_map; - size_t kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map ); - vx_param_description_t * param_def = _resize_bilinear_nhwc_kernel_param_def; - size_t param_def_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ); - vx_kernel_initialize_f initializer = _resize_bilinear_nhwc_initializer; - uint32_t key; - uint32_t i; + const _kernel_map_type* kernel_map; + size_t kernel_map_size; + size_t param_size; + uint32_t i = 0; - in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - - in_dtype = in_dtype == I8 ? U8 : in_dtype; - out_dtype = out_dtype == I8 ? U8 : out_dtype; - - key = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, align_corners, up_scale ); - for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + switch( kernel_id ) { - if ( kernel_map[i].key == key ) + case 0: + initializer = _resize_bilinear_nhwc_initializer; + kernel_map = _resize_bilinear_nhwc_kernel_map; + kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map ); + param_def = _resize_bilinear_nhwc_kernel_param_def; + param_size = _RESIZE_BILINEAR_NHWC_PARAM_NUM; + break; + case 1: + initializer = _bilinear_nhwc_bound_initializer; + kernel_map = _bilinear_nhwc_bound_kernel_map; + kernel_map_size = _cnt_of_array( _bilinear_nhwc_bound_kernel_map ); + param_def = _bilinear_nhwc_bound_kernel_param_def; + param_size = _BILINEAR_NHWC_BOUND_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == hashkey ) { break; } } - - if ( i < kernel_map_size ) + if( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; - kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.numParams = (uint32_t)param_size; kernel->info.initialize = initializer; // Register code source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, @@ -453,7 +619,8 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_param_t node0_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_param_t node1_params[_BILINEAR_NHWC_BOUND_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); @@ -463,8 +630,14 @@ static vsi_nn_kernel_node_t _setup float scale_y = (float)outputs[0]->attr.size[2] / (float)inputs[0]->attr.size[2]; float up_scale = scale_x == scale_y ? 
scale_x : 0; uint32_t rank = inputs[0]->attr.dim_num; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + uint32_t hashkeys[2] = {0}; + uint32_t i = 0; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; if (!is_same_type || depth != 2 || rank < 3 || (up_scale != 2.0f && up_scale != 3.0f && up_scale != 4.0f)) @@ -472,8 +645,24 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( kernel, inputs, outputs, - align_corners, half_pixel_centers, (uint32_t)up_scale); + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[1]->unique_id = kernel->unique_id; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[0] = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, + align_corners, (vsi_size_t)up_scale ); + hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale ); + + status = _query_kernel( ikernels[0], hashkeys[0], 0); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = _query_kernel( kernel, hashkeys[1], 1); + CHECK_STATUS_FAIL_GOTO(status, final ); shapes[0][0] = depth * inputs[0]->attr.size[1]; shapes[0][1] = inputs[0]->attr.size[2]; @@ -491,26 +680,41 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[1], rank ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, - reshape_tensors, input_num, &reshape_tensors[1], output_num ); - node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + // resize bilinear + node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[1], output_num ); + node0_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node0_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + status = vsi_nn_kernel_node_pass_param( node, node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node0_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node0_params[SCALAR_HALF_PIXEL] ); + vsi_nn_kernel_node_release( &node ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + // update bound for output tensor + memcpy( &attr, &(reshape_tensors[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.size[0] = 1; + attr.size[1] = 1; + attr.dim_num = 2; + reshape_tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + node = vsi_nn_kernel_create_node( graph, kernel ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); + status = vsi_nn_kernel_node_pass_param( node, node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM ); + +final: + for( i = 0; i < 2; i ++ ) + { + if( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); } } - vsi_safe_release_tensor(reshape_tensors[0]); vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 2e0cac5..2ccc607 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -118,7 +118,7 @@ static vsi_status get_scatter_nd_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index c277ba5..957a666 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -207,7 +207,7 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index f5571ab..897f106 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -75,10 +75,24 @@ static const _kernel_map_type _select_kernel_map[] = PACK_KERNEL_MAP(I8, U8, U8, U8), PACK_KERNEL_MAP(I8, I16, I16, I16), PACK_KERNEL_MAP(I8, F16, F16, F16), + PACK_KERNEL_MAP(I8, F16, U8, F16), + PACK_KERNEL_MAP(I8, U8, F16, F16), + PACK_KERNEL_MAP(I8, F16, I8, F16), + PACK_KERNEL_MAP(I8, I8, F16, F16), + PACK_KERNEL_MAP(I8, F16, I16, F16), + PACK_KERNEL_MAP(I8, I16, F16, F16), + PACK_KERNEL_MAP(I8, F16, F16, U8), PACK_KERNEL_MAP_2D(I8, I8, I8, I8), PACK_KERNEL_MAP_2D(I8, U8, U8, U8), PACK_KERNEL_MAP_2D(I8, I16, I16, I16), PACK_KERNEL_MAP_2D(I8, F16, F16, F16), + PACK_KERNEL_MAP_2D(I8, U8, F16, F16), + PACK_KERNEL_MAP_2D(I8, F16, U8, F16), + PACK_KERNEL_MAP_2D(I8, F16, I8, F16), + PACK_KERNEL_MAP_2D(I8, I8, F16, F16), + PACK_KERNEL_MAP_2D(I8, F16, I16, F16), + PACK_KERNEL_MAP_2D(I8, I16, F16, F16), + PACK_KERNEL_MAP_2D(I8, F16, F16, U8), }; /* @@ -142,7 +156,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); - if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( input0_attr->quant == 
VSI_NN_KERNEL_QUANT_DFP ) { input0_fl = input0_attr->dfp.fl; if (input0_fl > 0) @@ -154,13 +168,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer) input0Scale = (float)((int64_t)1 << -input0_fl); } } - else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { input0Scale = input0_attr->asymm.scale; input0Zp = input0_attr->asymm.zero_point; } - if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { input1_fl = input1_attr->dfp.fl; if (input1_fl > 0) @@ -172,13 +186,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer) input1Scale = (float)((int64_t)1 << -input1_fl); } } - else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { input1Scale = input1_attr->asymm.scale; input1Zp = input1_attr->asymm.zero_point; } - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { output_fl = output_attr->dfp.fl; if (output_fl > 0) @@ -190,7 +204,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) outputScale = (float)((int64_t)1 << -output_fl); } } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { outputScale = output_attr->asymm.scale; outputZP = output_attr->asymm.zero_point; @@ -203,13 +217,10 @@ DEF_KERNEL_INITIALIZER(_select_initializer) output_shape = output_attr->shape; gpu_param.dim = output_shape->size < 3 ? 2 : 3; - gpu_param.global_offset[0] = 0; - gpu_param.global_offset[1] = 0; - gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) @@ -218,83 +229,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer) (output_shape->data[2] + gpu_param.global_scale[2] - 1) / gpu_param.global_scale[2] : 1; - switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8, I8 ): - case _PACK_SELECT_KEY( I16, I16, I16 ): - { - gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvIntIn0toDst_2x8 = {{ - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvIntIn1toDst_2x8 = {{ - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - if (input0_fl >= output_fl) - { - uint8_t postshift = (uint8_t)gpu_min(input0_fl - output_fl, MAX_POST_SHIFT_BITS); - uniConvIntIn0toDst_2x8.data[7] = uniConvIntIn0toDst_2x8.data[7] | (postshift & 0x1F); - } - else - { - uint32_t 
idx = 0; - uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input0_fl), MAX_MULTIPLIER_NUM); - for (idx = 8; idx < 16; idx ++) - { - uniConvIntIn0toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff); - } - } - - - if (input1_fl >= output_fl) - { - uint8_t postshift = (uint8_t)gpu_min(input1_fl - output_fl, MAX_POST_SHIFT_BITS); - uniConvIntIn1toDst_2x8.data[7] = uniConvIntIn1toDst_2x8.data[7] | (postshift & 0x1F); - } - else - { - uint32_t idx = 0; - uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input1_fl), MAX_MULTIPLIER_NUM); - for (idx = 8; idx < 16; idx ++) - { - uniConvIntIn1toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff); - } - } - - status = vsi_nn_kernel_gpu_add_param( node, - "uniConvIntIn0toDst_2x8", &uniConvIntIn0toDst_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvIntIn1toDst_2x8", &uniConvIntIn1toDst_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - break; case _PACK_SELECT_KEY( F16, F16, F16 ): { gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ @@ -312,61 +248,66 @@ DEF_KERNEL_INITIALIZER(_select_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I16, I16, I16 ): case _PACK_SELECT_KEY( U8, U8, U8 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F16, F16 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): + case _PACK_SELECT_KEY( F16, U8, F16 ): + case _PACK_SELECT_KEY( F16, I8, F16 ): + case _PACK_SELECT_KEY( F16, I16, F16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): { - uint32_t idx = 0; - gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In0_2x8 = {{ - 0x99999999, // TCfg - 0x44444444, // ASelt + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt + 0x22222222, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In1_2x8 = {{ - 0x99999999, // TCfg + gpu_dp_inst_t uniU8MulAndPostShift0_Lo_2x8 = {{ + 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8AddZP_2x8 = {{ - 0x55555555, // TCfg + gpu_dp_inst_t uniU8MulAndPostShift1_Lo_2x8 = {{ + 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x00002600, // AccumType, ConstantType, and 
PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - uniU8SubZP_MulM_PStoF16In0_2x8.data[7] |= (in0_postShift & 0x1F); - uniU8SubZP_MulM_PStoF16In1_2x8.data[7] |= (in1_postShift & 0x1F); + multAndoutZP0[0] = (uint32_t)(in0_M0); + multAndoutZP0[1] = (uint32_t)((outputZP << in0_postShift) - input0Zp * in0_M0); + multAndoutZP1[0] = (uint32_t)(in1_M0); + multAndoutZP1[1] = (uint32_t)((outputZP << in1_postShift) - input1Zp * in1_M0); - for (idx = 8; idx < 16; idx ++) - { - uniU8SubZP_MulM_PStoF16In0_2x8.data[idx] = (vx_uint32)(in0_M0 << 16) | in0_M0; - uniU8SubZP_MulM_PStoF16In1_2x8.data[idx] = (vx_uint32)(in1_M0 << 16) | in1_M0; - } + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift0_Lo_2x8, in0_postShift ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift1_Lo_2x8, in1_postShift ); - status = vsi_nn_kernel_gpu_add_param( node, - "uniU8SubZP_MulM_PStoF16In0_2x8", &uniU8SubZP_MulM_PStoF16In0_2x8 ); + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8SubZP_MulM_PStoF16In1_2x8", &uniU8SubZP_MulM_PStoF16In1_2x8 ); + "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8AddZP_2x8", &uniU8AddZP_2x8 ); + "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift0_Lo_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, - "input0Zp", &input0Zp ); - status |= vsi_nn_kernel_gpu_add_param( node, - "input1Zp", &input1Zp ); - status |= vsi_nn_kernel_gpu_add_param( node, - "outputZP", &outputZP ); + "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift1_Lo_2x8 ); CHECK_STATUS_FAIL_GOTO(status, final ); } break; @@ -501,4 +442,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( select, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index b9f570b..8839470 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -39,7 +39,6 @@ __BEGIN_DECLS -#define _SLICE_KERNEL_SOURCE "slice" #define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice") // Add kernel hashtable here @@ -50,30 +49,30 @@ __BEGIN_DECLS #define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D, _SAMEFL) \ (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D << 1) | (_SAMEFL)) -#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_3D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \ - SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } #define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D") -#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \ - SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } #define SLICE_SH_KERNEL_SAMEFL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL") -#define 
PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \ - SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } #define SLICE_SH_KERNEL_SAMEFL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL_2D") -#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \ - SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } typedef struct { @@ -85,21 +84,33 @@ __BEGIN_DECLS static const _kernel_map_type _slice_kernel_map[] = { // Register kernel here - PACK_KERNEL_MAP( F16, I32, F16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_3D( F16, I32, F16 ), + PACK_KERNEL_MAP_3D( F16, I32, I8 ), + PACK_KERNEL_MAP_3D( F16, I32, U8 ), + PACK_KERNEL_MAP_3D( F16, I32, I16 ), + PACK_KERNEL_MAP_3D( I8, I32, F16 ), + PACK_KERNEL_MAP_3D( U8, I32, F16 ), + PACK_KERNEL_MAP_3D( I16, I32, F16 ), + PACK_KERNEL_MAP_3D( I16, I32, I16 ), + PACK_KERNEL_MAP_3D( U8, I32, U8 ), + PACK_KERNEL_MAP_3D( I8, I32, I8 ), - PACK_KERNEL_MAP_2D( F16, I32, F16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_2D( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F16, I32, F16 ), + PACK_KERNEL_MAP_2D( I16, I32, I16 ), + PACK_KERNEL_MAP_2D( F16, I32, I8 ), + PACK_KERNEL_MAP_2D( F16, I32, U8 ), + PACK_KERNEL_MAP_2D( F16, I32, I16 ), + PACK_KERNEL_MAP_2D( I8, I32, F16 ), + PACK_KERNEL_MAP_2D( U8, I32, F16 ), + PACK_KERNEL_MAP_2D( I16, I32, F16 ), + PACK_KERNEL_MAP_2D( U8, I32, U8 ), + PACK_KERNEL_MAP_2D( I8, I32, I8 ), - PACK_KERNEL_MAP_SAMEFL( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_SAMEFL( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL( I16, I32, I16 ), + PACK_KERNEL_MAP_SAMEFL( U8, I32, U8 ), - PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16 ), + PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8 ), }; #define _INPUT_NUM (2) @@ -201,18 +212,16 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) scaleOut = output_attr->asymm.scale; } - if ((F16 == input_dtype) - || (I16 == input_dtype) - || (BF16 == input_dtype) - ) + if ((I8 == input_dtype && input_dtype == output_dtype ) || + (U8 == input_dtype && input_dtype == output_dtype ) ) { - gpu_param.global_scale[0] = 8; + gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; } else { - gpu_param.global_scale[0] = 16; + gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index fea09ff..e9a9272 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -1416,31 +1416,42 @@ 
vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create switch( attr->quant ) { case VSI_NN_KERNEL_QUANT_DFP: - { + { int8_t fl = 0; status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_FIXED_POINT_POS, &fl, sizeof(int8_t)); CHECK_STATUS( status ); attr->dfp.fl = (int32_t)fl; + if (fl >= 0) { + attr->scale = 1.0f / ((float)((int64_t)1 << fl)); + } else { + attr->scale = (float)((int64_t)1 << -fl); } - break; + } break; case VSI_NN_KERNEL_QUANT_ASYMM: - { - status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_ZERO_POINT, - &(attr->asymm.zero_point), sizeof(int32_t)); - CHECK_STATUS( status ); - status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE, - &(attr->asymm.scale), sizeof(float)); - CHECK_STATUS( status ); + { + status = vxQueryTensor((vx_tensor)tensor, + VX_TENSOR_ZERO_POINT, + &(attr->asymm.zero_point), + sizeof(int32_t)); + CHECK_STATUS(status); + status = vxQueryTensor((vx_tensor)tensor, + VX_TENSOR_SCALE, + &(attr->asymm.scale), + sizeof(float)); + CHECK_STATUS(status); // Reset scale to 1e-8 - if( (attr->asymm.scale - 0.f) < 1e-8 ) - { + if ((attr->asymm.scale - 0.f) < 1e-8) + { attr->asymm.scale = (float)1e-8; attr->asymm.zero_point = 0; - } } - break; + attr->scale = attr->asymm.scale; + attr->zero_point = attr->asymm.zero_point; + } + break; default: + attr->scale = 1.0f; break; } return attr; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c index 80d56d0..dd32c01 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -189,6 +189,16 @@ static float celu_eval(float x, vsi_nn_kernel_lut_params *lut_param) return positive + negative; } +static float rcp_eval(float x) +{ + return 1.0f / x; +} + +static float softsign_eval(float x) +{ + return x / (1 + vsi_abs(x)); +} + static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) { float result = 0; @@ -245,6 +255,12 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_CELU: result = celu_eval(data, lut_param); break; + case VSI_NN_KERNEL_LUT_RCP: + result = rcp_eval(data); + break; + case VSI_NN_KERNEL_LUT_SOFTSIGN: + result = softsign_eval(data); + break; default: VSILOGE( "unsupported activation function:%d", lut_param->act_type ); break; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index bdb2240..d27a5f6 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -133,5 +133,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu) REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu) REGISTER_VX_FIRST_KERNEL_SELECTOR(matrixmul) REGISTER_VX_FIRST_KERNEL_SELECTOR(celu) +REGISTER_VX_FIRST_KERNEL_SELECTOR(rcp) +REGISTER_VX_FIRST_KERNEL_SELECTOR(softsign) +REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_bilinear) +REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_nearest) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 0ab544b..c6edaaa 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -146,6 +146,8 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, 
VSI_NN_KERNEL_LUT_CLIP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN ) #undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/resize_vx.c b/src/tim/vx/internal/src/kernel/vx/resize_vx.c new file mode 100644 index 0000000..3b2b167 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/resize_vx.c @@ -0,0 +1,152 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +#define REGISTER_SOFTMAX_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vx_node node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + int32_t type = vsi_nn_kernel_param_get_int32( params, "type" ); + +#ifdef VX_SCALE_EXTRA_PARAMETER_SUPPORT + vx_nn_scale_params_ext_t param; + param.align_corners = align_corners; + param.half_pixel_centers = half_pixel_centers; + switch (type) + { + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; + case 
VSI_NN_INTERPOLATION_BILINEAR: + param.base.type = VX_INTERPOLATION_BILINEAR; + break; + case VSI_NN_INTERPOLATION_AREA: + param.base.type = VX_INTERPOLATION_AREA; + break; + default: + param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + } + node = vxTensorScaleNode( graph->g, + inputs[0]->t, + (vx_nn_scale_params)(¶m), + sizeof(vx_nn_scale_params_ext_t), + outputs[0]->t ); +#else + vx_nn_scale_params_t param; + if (align_corners || half_pixel_centers) + { + return NULL; + } + switch (type) + { + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; + case VSI_NN_INTERPOLATION_BILINEAR: + param.type = VX_INTERPOLATION_BILINEAR; + break; + case VSI_NN_INTERPOLATION_AREA: + param.type = VX_INTERPOLATION_AREA; + break; + default: + param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; + } + + node = vxTensorScaleNode( graph->g, + inputs[0]->t, + ¶m, + sizeof(param), + outputs[0]->t ); +#endif + if ( NULL == node ) + { + VSILOGI("Call vxTensorScaleNode fail.(resize)"); + } + + return (vsi_nn_kernel_node_t)node; +} /* _setup() */ + +#define REGISTER_RESIZE_OPENVX_KERNEL(KERNEL_NAME) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_RESIZE_OPENVX_KERNEL( resize_nearest ) +REGISTER_RESIZE_OPENVX_KERNEL( resize_bilinear ) + +#undef REGISTER_RESIZE_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c index 572737c..5ae1499 100644 --- a/src/tim/vx/internal/src/kernel/vx/square_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c @@ -32,7 +32,6 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_lut.h" static vsi_nn_kernel_node_t _setup ( @@ -46,57 +45,7 @@ static vsi_nn_kernel_node_t _setup ) { vx_node node = NULL; -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_lut_params lut_param; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - lut_param.act_type = VSI_NN_KERNEL_LUT_SQUARE; - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto final; - } - - status = vsi_nn_kernel_lut(lut1, lut2, &lut_param); - CHECK_STATUS_FAIL_GOTO(status, final); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if ( NULL == node ) - { - node = vxActivationLayer( - graph->g, - inputs[0]->t, - VX_NN_ACTIVATION_SQUARE, - 0, - 0, - outputs[0]->t - ); - } - -final: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else node = vxActivationLayer( graph->g, inputs[0]->t, @@ -107,7 +56,6 @@ final: ); return (vsi_nn_kernel_node_t)node; -#endif } /* _setup() */ #define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME) \ diff --git 
a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl new file mode 100644 index 0000000..0372981 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl @@ -0,0 +1,478 @@ +__kernel void cumsum_F32toF32_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord_out.z = channel - 1; + write_imagef(output, coord_out, sum); + + for(coord.z = channel - 1; coord.z > 0; coord.z--) + { + float4 data = read_imagef(input, coord); + coord_out.z--; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(exclusive) + { + coord_out.z = 0; + write_imagef(output, coord_out, sum); + for(coord.z = 0; coord.z < channel - 1; coord.z++) + { + float4 data = read_imagef(input, coord); + coord_out.z++; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } + else + { + for(coord.z = 0; coord.z < channel; coord.z++) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0.0f; + + if(exclusive && rev) + { + coord_out.z = channel - 1; + write_imageui(output, coord_out, dst); + for(coord.z = channel - 1; coord.z > 0; coord.z--) + { + uint4 data = read_imageui(input, coord); + coord_out.z--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(exclusive) + { + coord_out.z = 0; + write_imageui(output, coord_out, dst); + for(coord.z = 0; coord.z < channel - 1; coord.z++) + { + uint4 data = read_imageui(input, coord); + coord_out.z++; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } + else + { + for(coord.z = 0; coord.z < channel; coord.z++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, 
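/*
 * Rescaling sketch for the U8 cumsum kernels, assuming the host passes
 * in_out_scale = s_in / s_out, in_out_zp_scale = -zp_in * s_in / s_out and
 * output_zp = zp_out (these relations are not defined in this file, so treat the
 * derivation as an assumption about the launcher):
 *
 *     real sum of n elements = s_in * (sum_q - n * zp_in)
 *     out_q = real / s_out + zp_out
 *           = sum_q * in_out_scale + n * in_out_zp_scale + output_zp
 *
 * which is exactly tmpSum = sum.x * in_out_scale + cnt * in_out_zp_scale + output_zp
 * as computed above, with cnt tracking n.
 */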
coord, dst); + } + } +} + +__kernel void cumsum_F32toF32_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord_out.y = height - 1; + write_imagef(output, coord_out, sum); + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + float4 data = read_imagef(input, coord); + coord_out.y--; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(exclusive) + { + coord_out.y = 0; + write_imagef(output, coord_out, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + float4 data = read_imagef(input, coord); + coord_out.y++; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0; + + if(exclusive && rev) + { + coord_out.y = height - 1; + write_imageui(output, coord_out, dst); + + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + coord_out.y--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(exclusive) + { + coord_out.y = 0; + write_imageui(output, coord_out, dst); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + coord_out.y++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } +} + +__kernel void cumsum_F32toF32_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int 
input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord_out.x = width - 1; + write_imagef(output, coord_out, sum); + for(coord.x = width - 1; coord.x > 0; coord.x--) + { + float4 data = read_imagef(input, coord); + coord_out.x--; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(exclusive) + { + coord_out.x = 0; + write_imagef(output, coord_out, sum); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + float4 data = read_imagef(input, coord); + coord_out.x++; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0; + + if(exclusive && rev) + { + coord_out.x = width - 1; + write_imageui(output, coord_out, dst); + for(coord.x = width - 1; coord.x > 0; coord.x--) + { + uint4 data = read_imageui(input, coord); + coord_out.x--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(exclusive) + { + coord_out.x = 0; + write_imageui(output, coord_out, dst); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + uint4 data = read_imageui(input, coord); + coord_out.x++; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl new file mode 100644 index 0000000..caced34 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl @@ -0,0 +1,314 @@ + +__kernel void cumsum_F32toF32_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + 
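/*
 * Note on the *_2D cumsum variants: they operate on image2d_t instead of
 * image2d_array_t and pack both positions into one int4 -- coord.xy is the read
 * position and coord.zw the write position -- so the exclusive modes can read
 * element i while writing the shifted slot, e.g.:
 *
 *     float4 data = read_imagef(input, coord.xy);   // element i
 *     write_imagef(output, coord.zw, sum);          // shifted output slot
 */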
int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord.w = height - 1; + write_imagef(output, coord.zw, sum); + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + float4 data = read_imagef(input, coord.xy); + coord.w--; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(exclusive) + { + write_imagef(output, coord.zw, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + float4 data = read_imagef(input, coord.xy); + coord.w++; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0; + + if(exclusive && rev) + { + coord.w = height - 1; + write_imageui(output, coord.zw, sum); + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + coord.w--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(exclusive) + { + write_imageui(output, coord.zw, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + coord.w++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } +} + +__kernel void cumsum_F32toF32_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + write_imagef(output, 
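/*
 * Worked example of the exclusive/rev flags for a row [a, b, c]:
 *
 *     inclusive, forward : [a, a+b, a+b+c]
 *     exclusive, forward : [0, a,   a+b  ]
 *     inclusive, reverse : [a+b+c, b+c, c]
 *     exclusive, reverse : [b+c,   c,   0]
 *
 * The branch started here implements the last case: write 0 at the far end, then
 * accumulate from the end while writing one slot behind the read cursor.
 */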
coord.zw, sum); + for(; coord.x > 0; coord.x--) + { + float4 data = read_imagef(input, coord.xy); + coord.z--; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(exclusive) + { + coord.z = 0; + write_imagef(output, coord.zw, sum); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + float4 data = read_imagef(input, coord.xy); + coord.z++; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0.0f; + + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + write_imageui(output, coord.zw, sum); + for(; coord.x > 0; coord.x--) + { + uint4 data = read_imageui(input, coord.xy); + coord.z--; + cnt += 1.0; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(exclusive) + { + coord.z = 0; + write_imageui(output, coord.zw, sum); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + coord.z++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl index c991ffc..65be20e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl @@ -136,6 +136,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha) return val < 0 ? 
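/*
 * The three helpers added below (rcp, sign, softsign) keep the common
 * (val, alpha, rcp_alpha) signature expected by the ELTWISE_UNARY_* macros even
 * though they ignore alpha and rcp_alpha:
 *
 *     rcp(x)      = 1 / x            (x == 0 yields +/-inf per IEEE-754)
 *     sign(x)     = sign(x)          (OpenCL built-in: -1, 0 or +1)
 *     softsign(x) = x / (1 + |x|)
 */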
x : val; } +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha) +{ + return 1.0f / val; +} + +float eltwise_unary_sign(float val, float alpha, float rcp_alpha) +{ + return sign(val); +} + +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha) +{ + return val / (1.0f + fabs(val)); +} + #define ELTWISE_UNARY_F32_2D(func_name) \ __kernel void func_name##_F32toF32_2D \ ( \ @@ -170,6 +185,9 @@ ELTWISE_UNARY_F32_2D(gelu) ELTWISE_UNARY_F32_2D(hard_gelu) ELTWISE_UNARY_F32_2D(selu) ELTWISE_UNARY_F32_2D(celu) +ELTWISE_UNARY_F32_2D(rcp) +ELTWISE_UNARY_F32_2D(sign) +ELTWISE_UNARY_F32_2D(softsign) #define ELTWISE_UNARY_U8_2D(func_name) \ __kernel void func_name##_U8toU8_2D \ @@ -206,6 +224,9 @@ ELTWISE_UNARY_U8_2D(gelu) ELTWISE_UNARY_U8_2D(hard_gelu) ELTWISE_UNARY_U8_2D(selu) ELTWISE_UNARY_U8_2D(celu) +ELTWISE_UNARY_U8_2D(rcp) +ELTWISE_UNARY_U8_2D(sign) +ELTWISE_UNARY_U8_2D(softsign) __kernel void neg_I32toI32_2D ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl index 20cc454..5a21ad8 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl @@ -136,6 +136,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha) return val < 0 ? x : val; } +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha) +{ + return 1.0f / val; +} + +float eltwise_unary_sign(float val, float alpha, float rcp_alpha) +{ + return sign(val); +} + +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha) +{ + return val / (1.0f + fabs(val)); +} + #define ELTWISE_UNARY_F32(func_name) \ __kernel void func_name##_F32toF32 \ ( \ @@ -170,6 +185,9 @@ ELTWISE_UNARY_F32(gelu) ELTWISE_UNARY_F32(hard_gelu) ELTWISE_UNARY_F32(selu) ELTWISE_UNARY_F32(celu) +ELTWISE_UNARY_F32(rcp) +ELTWISE_UNARY_F32(sign) +ELTWISE_UNARY_F32(softsign) #define ELTWISE_UNARY_U8(func_name) \ __kernel void func_name##_U8toU8 \ @@ -206,6 +224,9 @@ ELTWISE_UNARY_U8(gelu) ELTWISE_UNARY_U8(hard_gelu) ELTWISE_UNARY_U8(selu) ELTWISE_UNARY_U8(celu) +ELTWISE_UNARY_U8(rcp) +ELTWISE_UNARY_U8(sign) +ELTWISE_UNARY_U8(softsign) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl deleted file mode 100644 index f05e01d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl +++ /dev/null @@ -1,229 +0,0 @@ -__kernel void instance_norm_meanvari_F16( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height - ) -{ - int gidx = get_global_id(0); - int gidz = get_global_id(1); - int lidx = get_local_id(0); - - int4 coord = (int4)(gidx, 0, gidz, 0); - float4 data; - float sum = 0, sqr = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - data = read_imagef(input, coord); - coord.y++; - sum += data.x; - sqr += data.x * data.x; - } - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += 
dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 dst = (float4)(0); - dst.x = sum; - write_imagef(output, coord_out.xy, dst); - coord_out.x++; - dst.x = sqr; - write_imagef(output, coord_out.xy, dst); - } -} - -__kernel void instance_norm_meanvari_F16_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height - ) -{ - int gidx = get_global_id(0); - int gidz = get_global_id(1); - int lidx = get_local_id(0); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - float4 data; - float sum = 0, sqr = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - data = read_imagef(input, coord); - coord.y++; - sum += data.x; - sqr += data.x * data.x; - } - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 dst = (float4)(0); - dst.x = sum; - write_imagef(output, coord_out.xy, dst); - coord_out.x++; - dst.x = sqr; - write_imagef(output, coord_out.xy, dst); - } -} - -__kernel void instance_norm_F16toF16( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num - ) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); - int4 coord_para = (int4)(0, gidz, 0, 0); - - float4 gamma = read_imagef(scale, coord_para.yx); - float4 beta = read_imagef(bias, coord_para.yx); - float4 mean_vari = (float4)(0); - float scale_vari, bias_val; - - for(int i = 0; i < group_num; i++) - { - mean_vari.x += read_imagef(meanVari, coord_para.xy).x; - coord_para.x++; - mean_vari.y += read_imagef(meanVari, coord_para.xy).x; - coord_para.x+=3; - } - mean_vari *= dim_ratio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = gamma.s0 * mean_vari.s1; - bias_val = (beta.s0 - scale_vari * mean_vari.s0); - - float4 data, dst; - for(coord.y = 0; coord.y < height;coord.y++) - { - data = read_imagef(input, coord); - - dst.x = data.x * scale_vari + bias_val; - write_imagef(output, coord, dst); - } -} - -__kernel void instance_norm_F16toF16_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num - ) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(0, gidz); - int endH = gidy + height; - - float4 gamma = read_imagef(scale, coord_para.yx); - float4 beta = read_imagef(bias, coord_para.yx); - float4 mean_vari = (float4)(0); - float 
scale_vari, bias_val; - - for(int i = 0; i < group_num; i++) - { - mean_vari.x += read_imagef(meanVari, coord_para.xy).x; - coord_para.x++; - mean_vari.y += read_imagef(meanVari, coord_para.xy).x; - coord_para.x+=3; - } - mean_vari *= dim_ratio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = gamma.s0 * mean_vari.s1; - bias_val = (beta.s0 - scale_vari * mean_vari.s0); - - float4 data, dst; - for(; coord.y < endH; coord.y++) - { - data = read_imagef(input, coord); - - dst.x = data.x * scale_vari + bias_val; - write_imagef(output, coord, dst); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl index 5946570..85d7f98 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl @@ -1,13 +1,10 @@ -__kernel void instance_norm_meanvari_F32( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_F32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -27,8 +24,8 @@ __kernel void instance_norm_meanvari_F32( { data = read_imagef(input, coord); coord.y++; - sum += data.x; - sqr += data.x * data.x; + sum = sum + data.x; + sqr = sqr + data.x * data.x; } } lcl_sum[lidx] = sum; @@ -58,16 +55,13 @@ __kernel void instance_norm_meanvari_F32( } } -__kernel void instance_norm_meanvari_F32_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_F32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -89,8 +83,8 @@ __kernel void instance_norm_meanvari_F32_2D( { data = read_imagef(input, coord); coord.y++; - sum += data.x; - sqr += data.x * data.x; + sum = sum + data.x; + sqr = sqr + data.x * data.x; } } lcl_sum[lidx] = sum; @@ -121,23 +115,19 @@ __kernel void instance_norm_meanvari_F32_2D( } __kernel void instance_norm_F32toF32( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -156,7 +146,7 @@ __kernel void instance_norm_F32toF32( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); @@ -174,23 +164,19 @@ __kernel void instance_norm_F32toF32( } __kernel void instance_norm_F32toF32_2D( - __read_only 
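/*
 * Same signature change as the kernels above: the second-pass instance-norm kernels
 * drop input_zp/input_scale/input_fl and output_fl, keep only output_zp/output_scale,
 * and rename dim_ratio to inv_multiplier; the first-pass kernels are renamed
 * meanvari -> sums because they now emit raw sum(x) and sum(x*x).
 */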
image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -211,12 +197,12 @@ __kernel void instance_norm_F32toF32_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - bias_val = beta.s0 - scale_vari * mean_vari.s0; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); float4 data, dst; for(; coord.y < endH; coord.y++) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl index 3928749..12b6243 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl @@ -1,13 +1,10 @@ -__kernel void instance_norm_meanvari_I32( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_I32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -16,9 +13,8 @@ __kernel void instance_norm_meanvari_I32( int4 coord = (int4)(gidx, 0, gidz, 0); int4 data; - float sum = 0, sqr = 0; - int tmpSum = 0; - float e2InScale = input_fl * input_fl; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -29,13 +25,13 @@ __kernel void instance_norm_meanvari_I32( { data = read_imagei(input, coord); coord.y++; - tmpSum += data.x; - sqr += (data.x * data.x * e2InScale); + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sum = tmpSum * input_fl; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -45,7 +41,7 @@ __kernel void instance_norm_meanvari_I32( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -61,16 +57,13 @@ __kernel void instance_norm_meanvari_I32( } } -__kernel void instance_norm_meanvari_I32_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -80,9 +73,8 @@ __kernel void instance_norm_meanvari_I32_2D( int2 coord 
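/*
 * The I32 first pass now accumulates raw integer sums in _sum_x_x2 (sum(x) in .x,
 * sum(x*x) in .y) and converts to float once per work-item, instead of pre-scaling
 * every sample by input_fl. This is safe for instance norm: a constant input scale
 * multiplies both (x - mean) and the standard deviation, so it cancels out of the
 * normalized value. Sketch of the accumulation used below:
 *
 *     _sum_x_x2.x += data.x;            // sum(x)
 *     _sum_x_x2.y += data.x * data.x;   // sum(x^2)
 *     sum_x_x2 = convert_float2(_sum_x_x2);
 */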
= (int2)(gidx, gidy); int4 data; - float sum = 0, sqr = 0; - int tmpSum = 0; - float e2InScale = input_fl * input_fl; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -94,13 +86,13 @@ __kernel void instance_norm_meanvari_I32_2D( { data = read_imagei(input, coord); coord.y++; - tmpSum += data.x; - sqr += (data.x * data.x * e2InScale); + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sum = tmpSum * input_fl; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -110,7 +102,7 @@ __kernel void instance_norm_meanvari_I32_2D( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -127,23 +119,19 @@ __kernel void instance_norm_meanvari_I32_2D( } __kernel void instance_norm_I32toI32( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -162,13 +150,13 @@ __kernel void instance_norm_I32toI32( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = input_fl * output_fl * scale_vari; - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl; + float alpha = output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; int4 data, dst; for(coord.y = 0; coord.y < height;coord.y++) @@ -183,23 +171,19 @@ __kernel void instance_norm_I32toI32( } __kernel void instance_norm_I32toI32_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -220,13 +204,13 @@ __kernel void instance_norm_I32toI32_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = 
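/*
 * Folding used by the second pass (the same algebra in every variant). With
 * m = inv_multiplier * sum(x), var = inv_multiplier * sum(x^2) - m*m and
 * r = rsqrt(var + eps):
 *
 *     out_q = ((x - m) * gamma * r + beta) * output_scale + output_zp
 *           = x * alpha + bias_val
 *
 * where alpha    = output_scale * gamma * r
 *       bias_val = (beta - gamma * r * m) * output_scale + output_zp
 *
 * so the per-pixel loop reduces to a single multiply-add.
 */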
gamma.s0 * mean_vari.s1; - float alpha = input_fl * output_fl * scale_vari; - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl; + float alpha = output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; int4 data, dst; for(; coord.y < endH; coord.y++) @@ -241,23 +225,19 @@ __kernel void instance_norm_I32toI32_2D( } __kernel void instance_norm_I32toF32( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -276,12 +256,12 @@ __kernel void instance_norm_I32toF32( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = input_fl * scale_vari; + float alpha = scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0); int4 data; @@ -296,23 +276,19 @@ __kernel void instance_norm_I32toF32( } __kernel void instance_norm_I32toF32_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -333,12 +309,12 @@ __kernel void instance_norm_I32toF32_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = input_fl * scale_vari; + float alpha = scale_vari; bias_val = beta.s0 - scale_vari * mean_vari.s0; int4 data; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl index 8b82717..1494685 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl @@ -1,13 +1,10 @@ -__kernel void instance_norm_meanvari_U8( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_U8( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int 
height ) { int gidx = get_global_id(0); @@ -16,9 +13,8 @@ __kernel void instance_norm_meanvari_U8( int4 coord = (int4)(gidx, 0, gidz, 0); uint4 data; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - float e2InScale = input_scale * input_scale; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -29,14 +25,13 @@ __kernel void instance_norm_meanvari_U8( { data = read_imageui(input, coord); coord.y++; - tmpSum += data.x; - tmpSqr += data.x * data.x; + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; - sum = (tmpSum - height * input_zp) * input_scale; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -46,7 +41,7 @@ __kernel void instance_norm_meanvari_U8( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -62,16 +57,13 @@ __kernel void instance_norm_meanvari_U8( } } -__kernel void instance_norm_meanvari_U8_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -81,9 +73,8 @@ __kernel void instance_norm_meanvari_U8_2D( int2 coord = (int2)(gidx, gidy); uint4 data; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - float e2InScale = input_scale * input_scale; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -95,14 +86,13 @@ __kernel void instance_norm_meanvari_U8_2D( { data = read_imageui(input, coord); coord.y++; - tmpSum += data.x; - tmpSqr += data.x * data.x; + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; - sum = (tmpSum - height * input_zp) * input_scale; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -112,7 +102,7 @@ __kernel void instance_norm_meanvari_U8_2D( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -129,23 +119,19 @@ __kernel void instance_norm_meanvari_U8_2D( } __kernel void instance_norm_U8toU8( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t 
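/*
 * The U8 second-pass kernels no longer subtract input_zp per pixel or carry
 * input_scale. A sketch of why raw u8 codes give the same result (an observation,
 * not part of the original sources): with x_real = s * (x_q - zp),
 *
 *     mean_real = s * (mean_q - zp)   and   std_real = s * std_q
 *     (x_real - mean_real) / std_real = (x_q - mean_q) / std_q
 *
 * i.e. the zero point is removed by the mean subtraction and the scale cancels
 * against the standard deviation, so only output_scale/output_zp are needed to map
 * the normalized value into the output tensor's domain.
 */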
output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -156,7 +142,6 @@ __kernel void instance_norm_U8toU8( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -165,19 +150,18 @@ __kernel void instance_norm_U8toU8( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data, dst; for(coord.y = 0; coord.y < height;coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; @@ -187,23 +171,19 @@ __kernel void instance_norm_U8toU8( } __kernel void instance_norm_U8toU8_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -216,7 +196,6 @@ __kernel void instance_norm_U8toU8_2D( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -225,19 +204,18 @@ __kernel void instance_norm_U8toU8_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data, dst; for(; coord.y < endH; coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; @@ -247,23 +225,19 @@ __kernel void instance_norm_U8toU8_2D( } __kernel void instance_norm_U8toF16( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int 
gidz = get_global_id(1); @@ -274,7 +248,6 @@ __kernel void instance_norm_U8toF16( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -283,19 +256,18 @@ __kernel void instance_norm_U8toF16( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data; for(coord.y = 0; coord.y < height;coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; @@ -304,23 +276,19 @@ __kernel void instance_norm_U8toF16( } __kernel void instance_norm_U8toF16_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -333,7 +301,6 @@ __kernel void instance_norm_U8toF16_2D( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -342,19 +309,18 @@ __kernel void instance_norm_U8toF16_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data; for(; coord.y < endH; coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax.cl new file mode 100644 index 0000000..0296e39 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax.cl @@ -0,0 +1,191 @@ +#define FP32_MIN -3.4e38 +#define I32_MIN -2147483647 + +__kernel void maxpoolwithargmax_F32toF32_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = 
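/*
 * Window and index convention (restating what the code below does): the pool window
 * is [wstart, wend) x [hstart, hend), clipped to the input, and the winning position
 * is flattened into the int32 argmax plane in input-tensor element units:
 *
 *     index = x + y * width + z * width * height
 *
 * while the output plane receives the max value itself.
 */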
min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + float4 dst = (float4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + float4 data = read_imagef(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + dst.x = value_max; + write_imagef(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_BF16toBF16_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + uint4 src = read_imageui(input, coord_in); + src = src << 16; + float4 data; + _viv_asm(COPY, data, src, 16); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + _viv_asm(COPY, dst, value_max, 4); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_U32toU32_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + uint value_max = 0; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + uint4 data = read_imageui(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + dst.x = convert_uint(convert_float(value_max) * scale + tail); + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_I32toI32_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, 
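/*
 * For the quantized variants (U32/I32 here, and the _2D copies below) only the max
 * value is requantized on the way out:
 *
 *     dst.x = convert_int(convert_float(value_max) * scale + tail);
 *
 * presumably with scale = s_in / s_out and tail folding the zero-point shift (an
 * assumption about the host-side launcher); the argmax index is written untouched,
 * still in raw element units.
 */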
int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + int value_max = I32_MIN; + int4 dst = (int4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + int4 data = read_imagei(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + dst.x = convert_int(convert_float(value_max) * scale + tail); + write_imagei(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax_2d.cl new file mode 100644 index 0000000..33bd14f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax_2d.cl @@ -0,0 +1,190 @@ +#define FP32_MIN -3.4e38 +#define I32_MIN -2147483647 + +__kernel void maxpoolwithargmax_F32toF32_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + float4 dst = (float4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + float4 data = read_imagef(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + dst.x = value_max; + write_imagef(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_BF16toBF16_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + 
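/*
 * BF16 handling in this loop: bfloat16 is the upper 16 bits of an IEEE-754 float32,
 * so the raw 16-bit codes read below are shifted left by 16 and bit-copied into a
 * float4 (the _viv_asm COPY reinterprets bits, it does not convert), letting the
 * comparison run in float; on output the float bits are copied back and shifted
 * right by 16 to recover the bf16 code:
 *
 *     src = src << 16;  _viv_asm(COPY, data, src, 16);    // bf16 -> f32 bits
 *     _viv_asm(COPY, dst, value_max, 4);  dst.x >>= 16;   // f32 bits -> bf16
 */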
uint4 src = read_imageui(input, coord_in); + src = src << 16; + float4 data; + _viv_asm(COPY, data, src, 16); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + _viv_asm(COPY, dst, value_max, 4); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_U32toU32_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + uint value_max = 0; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + uint4 data = read_imageui(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + dst.x = convert_uint(convert_float(value_max) * scale + tail); + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_I32toI32_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + int value_max = I32_MIN; + int4 dst = (int4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + int4 data = read_imagei(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + dst.x = convert_int(convert_float(value_max) * scale + tail); + write_imagei(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/mod.cl b/src/tim/vx/internal/src/libnnext/ops/cl/mod.cl new file mode 100644 index 0000000..42649c9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/mod.cl @@ -0,0 +1,306 @@ +__kernel void mod_F32F32toF32 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src0; + float4 src1; + READ_IMAGEF_2DARRAY(src0, input, coord); + 
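+    /* Descriptive note (editorial): in this float variant both operands are read as F32 and
+     * the result is the C-style truncated remainder fmod(src0, src1); the isfmod flag and the
+     * scale/tail parameters are only used by the integer/quantized variants below, which
+     * dequantize first and can also select the floor-based (Python-style) modulo
+     * in0 - in1 * floor(in0 / in1). */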
READ_IMAGEF_2DARRAY(src1, input1, coord); + float4 dst = fmod(src0, src1); + write_imagef(output, coord, dst); +} + +__kernel void mod_F32F32toF32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src0 = read_imagef(input, coord); + float4 src1 = read_imagef(input1, coord); + float4 dst = fmod(src0, src1); + write_imagef(output, coord, dst); +} + +__kernel void mod_I32I32toI32 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void mod_I32I32toI32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void mod_I32I32toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_I32I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0 = 
convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8U8toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0, src1; + float4 in0, in1, out; + READ_IMAGEUI_2DARRAY(src0, input, coord); + READ_IMAGEUI_2DARRAY(src1, input1, coord); + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1) * input1Scale + input1Tail; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8U8toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + uint4 src1 = read_imageui(input1, coord); + float4 in0, in1, out; + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1) * input1Scale + input1Tail; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8I32toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0; + int4 src1; + float4 in0, in1, out; + READ_IMAGEUI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0, in1, out; + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + 
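+    /* Descriptive note (editorial): dst holds the requantized remainder. in0 was dequantized
+     * with input0Scale/input0Tail, the I32 divisor src1 was used as-is, and the result was
+     * mapped back to the U8 output range through outputScale/outputTail before the
+     * convert_uint4 above. */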
write_imageui(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl index feef55a..b2d6aae 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl @@ -45,21 +45,25 @@ inline float roi_align_1x1 #define EPS_GRID 0.00001f -__kernel void roi_align_F32toF32 +__kernel void roi_align_F32_F32toF32 ( - __read_only image2d_array_t input, - __read_only image2d_t rois, - __read_only image2d_t n_rois, - __write_only image2d_array_t output, - float spatial_x_scale, - float spatial_y_scale, - float in_width, - float in_height, - float rcp_of_out_width, - float rcp_of_out_height, - float sampling_x_ratio, - float sampling_y_ratio, - int depth + __read_only image2d_array_t input, + __read_only image2d_t rois, + __read_only image2d_t n_rois, + __write_only image2d_array_t output, + float input_scale, + float input_tail, + float output_scale, + float output_zp, + float spatial_x_scale, + float spatial_y_scale, + float in_width, + float in_height, + float rcp_of_out_width, + float rcp_of_out_height, + float sampling_x_ratio, + float sampling_y_ratio, + int depth ) { int px = get_global_id(0); @@ -105,4 +109,126 @@ __kernel void roi_align_F32toF32 write_imagef(output, (int4)(px, py, kz1, 0), interp); } +} + +inline float roi_align_1x1_U8toF32 +( + __read_only image2d_array_t input, + float input_scale, + float input_tail, + float2 region_start, + float2 region_end, + float2 bin_size, + int2 grid_size, + float2 rcp_of_grid_size, + int pz +) +{ + float sum = 0; + + for(int iy = 0; iy < grid_size.y; ++iy) + { + for(int ix = 0; ix < grid_size.x; ++ix) + { + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f); + float2 pos = region_start + ixy * bin_size * rcp_of_grid_size; + + int2 xy_low = convert_int2(pos); + int2 xy_high = xy_low + 1; + + float ly = pos.y - xy_low.y; + float lx = pos.x - xy_low.x; + float hy = 1.0f - ly; + float hx = 1.0f - lx; + + float w1 = hy * hx; + float w2 = hy * lx; + float w3 = ly * hx; + float w4 = ly * lx; + + uint4 data; + data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; + data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; + data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x; + data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x; + + float4 value = convert_float4(data) * input_scale + input_tail; + + sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w; + } + } + + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); +} + +__kernel void roi_align_U8_U16toU8 +( + __read_only image2d_array_t input, + __read_only image2d_t rois, + __read_only image2d_t n_rois, + __write_only image2d_array_t output, + float input_scale, + float input_tail, + float output_scale, + float output_zp, + float spatial_x_scale, + float spatial_y_scale, + float in_width, + float in_height, + float rcp_of_out_width, + float rcp_of_out_height, + float sampling_x_ratio, + float sampling_y_ratio, + int depth +) +{ + int px = get_global_id(0); + int py = get_global_id(1); + int pw = get_global_id(2); + + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x; + float4 roi_x = convert_float4(read_imageui(rois, (int2)(0, pw))); + float4 roi_y = convert_float4(read_imageui(rois, (int2)(1, pw))); + float4 roi_z = convert_float4(read_imageui(rois, (int2)(2, pw))); + float4 roi_w = convert_float4(read_imageui(rois, (int2)(3, pw))); + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, 
roi_w.x); + + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale); + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f); + + float2 spatial_indx = (float2)(px, py); + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); + float2 max_spatial_dims = (float2)(in_width, in_height); + + float2 bin_size = roi_dims * pooled_dims; + float2 region_start = spatial_indx * bin_size + roi_anchor.xy; + float2 region_end = region_start + bin_size; + + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio); + + roi_bin_grid = roi_bin_grid == 0 ? ceil(bin_size - EPS_GRID) : roi_bin_grid; + + int kz = roi_batch * depth; + float2 rcp_of_grid_size = 1.0f / roi_bin_grid; + int2 grid_size_xy = convert_int2(roi_bin_grid); + float4 interp; + int kz1 = pw * depth; + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++) + { + interp.x = roi_align_1x1_U8toF32( input, + input_scale, + input_tail, + region_start, + region_end, + bin_size, + grid_size_xy, + rcp_of_grid_size, + kz); + + uint4 dst; + interp.x = interp.x * output_scale + output_zp; + interp.x = interp.x < 255 ? interp.x : 255; + dst.x = convert_uint_rte(interp.x); + write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx); + } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl new file mode 100644 index 0000000..0b8f988 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl @@ -0,0 +1,327 @@ +#define LOCAL_SIZE_X (32) +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input, coord.xy); + + write_imagef(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index 
= read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + write_imagef(output, coord.xy, data); + write_imagei(indices, coord.xy, index); + } +} + +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_U32toU32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + uint4 data = read_imageui(input, coord.xy); + + write_imageui(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + uint4 left = read_imageui(input_t, coord.xy); + uint4 right = read_imageui(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imageui(input_t, coord.xy, right); + write_imageui(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + uint4 left = read_imageui(input_t, coord.xy); + uint4 right = read_imageui(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imageui(input_t, coord.xy, right); + write_imageui(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + uint4 data = read_imageui(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + write_imageui(output, coord.xy, data); + write_imagei(indices, coord.xy, index); + } +} + +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_I32toI32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t 
indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + int4 data = read_imagei(input, coord.xy); + + write_imagei(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + int4 left = read_imagei(input_t, coord.xy); + int4 right = read_imagei(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagei(input_t, coord.xy, right); + write_imagei(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + int4 left = read_imagei(input_t, coord.xy); + int4 right = read_imagei(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagei(input_t, coord.xy, right); + write_imagei(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + int4 data = read_imagei(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + write_imagei(output, coord.xy, data); + write_imagei(indices, coord.xy, index); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum.vx new file mode 100644 index 0000000..fad3ad2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum.vx @@ -0,0 +1,262 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits 
uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(coord.z = 0; coord.z < channel; coord.z++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_8BITS_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_8BITS_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_I16toI16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_8BITS_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_8BITS_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only 
image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(; coord.x < width; coord.x += 8) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_QINT_AXIS0(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + for(; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + 
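+        /* Descriptive note (editorial): sum0/sum1 carry the running horizontal prefix sums  \
+           (rowSum already has input_zp subtracted); after rescaling with in_out_scale and    \
+           output_zp the two int4 halves are packed back to 8 lanes and written out below. */ \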
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +CUMSUM_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16) +CUMSUM_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_2d.vx new file mode 100644 index 0000000..c54bb35 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_2d.vx @@ -0,0 +1,204 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_F16toF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + + for(; coord.y < height; coord.y++) + { + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_8BITS_AXIS1_2D(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0); \ + int4 sum1 = (int4)(0); \ + int4 sum2 = (int4)(0); \ + int4 sum3 = (int4)(0); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32D_4x4); \ + \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * 
in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +CUMSUM_8BITS_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_AXIS1_2D(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_I16toI16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_F16toF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0, sum1; \ + sum0 ^= sum0; \ + sum1 ^= sum1; \ + short zp = (short)input_zp; \ + \ + for(; coord.x < width; coord.x += 
8) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +CUMSUM_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16) +CUMSUM_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_bf16.vx new file mode 100644 index 0000000..5d45e73 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_bf16.vx @@ -0,0 +1,188 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +__kernel void cumsum_BF16toBF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + float4 data0, data1; + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data0, data1; + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + + for(; coord.x < width; coord.x += 8) + { + float4 data0, data1; + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + for(; coord.y < height; coord.y++) + { + float4 data0, data1; + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + + for(; coord.x < width; 
coord.x += 8) + { + float4 data0, data1; + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx new file mode 100644 index 0000000..b9f4e17 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx @@ -0,0 +1,178 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16) + + +#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(coord.y = 0; coord.y < 
height; coord.y++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ 
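+    /* Descriptive note (editorial): F16 -> quantized horizontal cumsum. The running F16     \
+       prefix sum in 'sum' is requantized on every step through multAndoutZP0 (the packed    \
+       multiplier and output zero point declared above) via uniU8MulAndPostShift_0_Lo_2x8. */ \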
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx index 696807f..8b7a639 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx @@ -50,6 +50,22 @@ float4 eltwise_unary_celu(float4 val) return val < 0 ? x : val; } +float4 eltwise_unary_rcp(float4 val) +{ + return 1.0f / val; +} + +float4 eltwise_unary_sign(float4 val) +{ + return sign(val); +} + +float4 eltwise_unary_softsign(float4 val) +{ + float4 _rcp = 1.0f / (1.0f + fabs(val)); + return val * _rcp; +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -94,83 +110,6 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -//EXP -ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SIN -ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, 
vxc_short8) -ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//COS -ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//LOG -ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SELU -ELTSISE_UNARY_2D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//NEG -ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) 
-ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//CELU -ELTSISE_UNARY_2D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -205,17 +144,36 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } +#define ADD_ELTSISE_UNARY_2D(func_name) \ +ELTSISE_UNARY_2D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_2D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_2D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_2D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_2D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_BF16_2D(func_name) + //EXP -ELTSISE_UNARY_BF16_2D(exp) +ADD_ELTSISE_UNARY_2D(exp) //SIN -ELTSISE_UNARY_BF16_2D(sin) +ADD_ELTSISE_UNARY_2D(sin) //COS -ELTSISE_UNARY_BF16_2D(cos) +ADD_ELTSISE_UNARY_2D(cos) //LOG -ELTSISE_UNARY_BF16_2D(log) +ADD_ELTSISE_UNARY_2D(log) //SELU -ELTSISE_UNARY_BF16_2D(selu) +ADD_ELTSISE_UNARY_2D(selu) //NEG -ELTSISE_UNARY_BF16_2D(neg) +ADD_ELTSISE_UNARY_2D(neg) //CELU -ELTSISE_UNARY_BF16_2D(celu) +ADD_ELTSISE_UNARY_2D(celu) +//RCP +ADD_ELTSISE_UNARY_2D(rcp) +//SIGN +ADD_ELTSISE_UNARY_2D(sign) +//SOFTSIGN +ADD_ELTSISE_UNARY_2D(softsign) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx index d150e2a..df52777 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx @@ -50,6 +50,22 @@ float4 eltwise_unary_celu(float4 val) return val < 0 ? 
x : val; } +float4 eltwise_unary_rcp(float4 val) +{ + return 1.0f / val; +} + +float4 eltwise_unary_sign(float4 val) +{ + return sign(val); +} + +float4 eltwise_unary_softsign(float4 val) +{ + float4 _rcp = 1.0f / (1.0f + fabs(val)); + return val * _rcp; +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -94,83 +110,6 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \ _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -//EXP -ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SIN -ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//COS -ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//LOG -ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, 
vxc_char8) -ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SELU -ELTSISE_UNARY_3D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//NEG -ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//CELU -ELTSISE_UNARY_3D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -204,17 +143,36 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } +#define ADD_ELTSISE_UNARY_3D(func_name) \ +ELTSISE_UNARY_3D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \ 
+ELTSISE_UNARY_3D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_3D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_3D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_3D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_3D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_BF16(func_name) + //EXP -ELTSISE_UNARY_BF16(exp) +ADD_ELTSISE_UNARY_3D(exp) //SIN -ELTSISE_UNARY_BF16(sin) +ADD_ELTSISE_UNARY_3D(sin) //COS -ELTSISE_UNARY_BF16(cos) +ADD_ELTSISE_UNARY_3D(cos) //LOG -ELTSISE_UNARY_BF16(log) +ADD_ELTSISE_UNARY_3D(log) //SELU -ELTSISE_UNARY_BF16(selu) +ADD_ELTSISE_UNARY_3D(selu) //NEG -ELTSISE_UNARY_BF16(neg) +ADD_ELTSISE_UNARY_3D(neg) //CELU -ELTSISE_UNARY_BF16(selu) +ADD_ELTSISE_UNARY_3D(celu) +//RCP +ADD_ELTSISE_UNARY_3D(rcp) +//SIGN +ADD_ELTSISE_UNARY_3D(sign) +//SOFTSIGN +ADD_ELTSISE_UNARY_3D(softsign) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx new file mode 100644 index 0000000..c1b970d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx @@ -0,0 +1,370 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniResetFp32_4x4; +_viv_uniform int group_stride; + +#define GROUP_NORM_SUMS_8BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, 0); \ + src_type src0; \ + float2 sums_f32 = 0; \ + int2 sums = 0, sum_x_x2; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + if (gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums = sums + sum_x_x2; \ + } \ + sums_f32 = convert_float2(sums); \ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \ + sums_f32.x = sums_f32.x * input_scale + 
sum_x_tail; \ + } \ + lcl_sum[lidx] = sums_f32.x; \ + lcl_sqr[lidx] = sums_f32.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum_x = 0,sum_x2 = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum_x += dot(tmp_sum[i], one); \ + sum_x2 += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16) +GROUP_NORM_SUMS_8BITS_IMPL(I8, vxc_char16) + +#define GROUP_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + \ + int2 coord = (int2)(gidx, get_global_id(1)); \ + src_type src0; \ + float2 sums = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + if(gidx < width) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums.y = sums.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums.x; \ + sums.x = sums.x * input_scale + sum_x_tail; \ + } \ + lcl_sum[lidx] = sums.x; \ + lcl_sqr[lidx] = sums.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum_x = 0,sum_x2 = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum_x += dot(tmp_sum[i], one); \ + sum_x2 += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16) +GROUP_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_means( + image2d_t input, image2d_t output, float eps, float group_ratio) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_uchar16 src0 = 1; + float2 sum_sqr = (float2)(0); + float4 mean_vari; + VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4); + + __local float2 lcl_data[16]; + __local float2 lcl_sum[4]; + + for(; coord.x < group_stride; coord.x += 64) + { + mean_vari += read_imagef(input, coord); + } + lcl_data[lidx] = mean_vari.xy; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx < 4) + { + float2 tmpSum = (float2)(0); + for(int i = lidx; i < 16; i+=4) + { + tmpSum += lcl_data[i]; + } + lcl_sum[lidx] = tmpSum; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 0; i < 4; i++) + { + sum_sqr += lcl_sum[i]; + } + mean_vari.xy = sum_sqr * group_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord.x = 0; + write_imagef(output, coord, mean_vari); + } +} + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform 
VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +#define CONVERT_INPUT_TO_F32() \ +VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ +VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ +VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ +VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); + +#define GROUP_NORM_8BITS_IMPL(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16) +GROUP_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16, vxc_char16) + +#define GROUP_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari 
= read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16) +GROUP_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16) + +#define GROUP_NORM_8BITS_F32_IMPL(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_F32_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16) 
+GROUP_NORM_8BITS_F32_IMPL(I8_F32toI8, vxc_char16, vxc_char16) + +#define GROUP_NORM_8BITS_F32_IMPL_2D(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_F32_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16) +GROUP_NORM_8BITS_F32_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx new file mode 100644 index 0000000..120e37e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx @@ -0,0 +1,233 @@ +#include "cl_viv_vx_ext.h" + + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +_viv_uniform float input_scale; +_viv_uniform float input_zp; + +#define GROUP_NORM_8BITSTOF16_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 src1, outval; \ + vxc_half8 scale_h, dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, 
coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITSTOF16_IMPL(U8_F16toF16, vxc_uchar16) +GROUP_NORM_8BITSTOF16_IMPL(I8_F16toF16, vxc_char16) + + +#define GROUP_NORM_8BITSTOF16_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 src1, outval; \ + vxc_half8 scale_h, dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1,
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITSTOF16_IMPL_2D(U8_F16toF16, vxc_uchar16) +GROUP_NORM_8BITSTOF16_IMPL_2D(I8_F16toF16, vxc_char16) + +#define GROUP_NORM_8TOF16_F32_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, 
outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8TOF16_F32_IMPL(U8_F32toF16, vxc_uchar16) +GROUP_NORM_8TOF16_F32_IMPL(I8_F32toF16, vxc_char16) + +#define GROUP_NORM_8TOF16_F32_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8TOF16_F32_IMPL_2D(U8_F32toF16, vxc_uchar16) +GROUP_NORM_8TOF16_F32_IMPL_2D(I8_F32toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx new file mode 100644 index 0000000..b62b67f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx @@ -0,0 +1,347 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; + +_viv_uniform float output_scale; 
+_viv_uniform float output_zp; + +#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr; \ + float4 tmpSumSqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + if(gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + tmpSumSqr += sumsqr; \ + } \ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = tmpSumSqr.x; \ + lcl_sqr[lidx] = tmpSumSqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8) +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) + +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + \ + int2 coord = (int2)(gidx, get_global_id(1)); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + if(gidx < width) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \ + sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = sumsqr.x; \ + lcl_sqr[lidx] = sumsqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) 
+GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) + +#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define GROUP_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, 
coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = output_scale * scale_vari; \ + bias_val = input_scale * (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define GROUP_NORM_16BITS_F32_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + 
norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_F32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx deleted file mode 100644 index 161383d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx +++ /dev/null @@ -1,306 +0,0 @@ 
-#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - - int2 coord = (int2)(gidx, get_global_id(1)); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - } - - lcl_sum[lidx] = sumsqr.x; - lcl_sqr[lidx] = sumsqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - 
int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = alpha * 
tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx deleted file mode 100644 index cb00ac9..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx +++ /dev/null @@ -1,174 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - 
vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 
16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx deleted file mode 100644 index 1282e00..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx +++ /dev/null @@ -1,339 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform float inFlScale_s2; -_viv_uniform float input_fl_scale; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16( - image2d_array_t input, - image2d_array_t output, - float eps, - int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - //tmpSumSqr += sumsqr; - tmpSumSqr.x += sumsqr.x; - sqr += (sumsqr.y * inFlScale_s2); - } - sum = tmpSumSqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - - int2 coord = (int2)(gidx, gidz); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - - 
__local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sqr = sumsqr.y * inFlScale_s2; - sum = sumsqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, 
scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = 
(int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx deleted file mode 100644 index 397a5f8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx +++ /dev/null @@ -1,191 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform float input_fl_scale; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + 
bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - 
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx deleted file mode 100644 index 6a407a3..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx +++ /dev/null @@ -1,317 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniSumInt8_16x1; -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1; -_viv_uniform float inFlScale_s2; -_viv_uniform float input_fl_scale; - -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; - -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - 
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - tmpSqr += (tmpSqr1); - } - sqr = tmpSqr * inFlScale_s2; - sum = tmpSum * input_fl_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - - int2 coord = (int2)(gidx, gidz); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - sqr = tmpSqr1 * inFlScale_s2; - sum = tmpSum1 * input_fl_scale; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, 
tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx deleted file mode 100644 index 350e425..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx +++ /dev/null @@ -1,186 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float input_fl_scale; - -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; - -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * 
tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, 
coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx deleted file mode 100644 index c08a996..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx +++ /dev/null @@ -1,342 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform float e2InScale; -_viv_uniform float rowSumScale; -_viv_uniform float scale_inOut; -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -_viv_uniform VXC_512Bits uniResetFp32_4x4; -_viv_uniform int group_stride; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - - int2 coord = (int2)(gidx, get_global_id(1)); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSqr, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - if(gidx < width) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - VXC_DP16x1(tmpSqr1, src0, src0, 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1; - sqr = (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum1 + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari( - image2d_t input, image2d_t output, float eps, float group_ratio) -{ - int gidx = get_global_id(0); - int lidx = get_local_id(0); - - int2 coord = (int2)(gidx, get_global_id(1)); - vxc_uchar16 src0; - float2 sum_sqr = (float2)(0); - vxc_float4 mean_vari; - VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4); - - __local float2 lcl_data[16]; - __local float2 lcl_sum[4]; - - for(; coord.x < group_stride; coord.x += 64) - { - mean_vari += read_imagef(input, coord); - } - lcl_data[lidx] = mean_vari.xy; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidx < 4) - { - float2 tmpSum = (float2)(0); - for(int i = lidx; i < 16; i+=4) - { - tmpSum += lcl_data[i]; - } - lcl_sum[lidx] = tmpSum; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lidx == 0) - { - for(int i = 0; i < 4; i++) - { - sum_sqr += lcl_sum[i]; - } - mean_vari.xy = sum_sqr * group_ratio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord.x = 0; - write_imagef(output, coord, mean_vari); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); 
- VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - 
vxc_uchar16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + 
bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx deleted file mode 100644 index a1f4ce0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx +++ /dev/null @@ -1,207 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform float input_scale; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - 
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, 
tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - 
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx index c1266fc..fce0623 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx @@ -72,7 +72,8 @@ __kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid) _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; @@ -121,6 +122,9 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) -GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx index a9c8d44..1a037de 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx @@ -54,7 +54,8 @@ __kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \ _viv_asm(COPY, dst, dst1, 8); \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid) _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; @@ -91,6 +92,9 @@ __kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \ _viv_asm(COPY, dst, dst1, 8); \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8) -GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8) -GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8) +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8) +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8) +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8) +GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8) +GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8) +GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx new file 
mode 100644 index 0000000..1644ecd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx @@ -0,0 +1,268 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float inv_multiplier; +_viv_uniform int group_num; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniSum_X_X2_16x2; +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; + +#define INSTANCE_NORM_SUMS_8BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, gidz); \ + src_type src0; \ + float2 sums_f32 = 0; \ + int2 sums = 0, sum_x_x2; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + if(gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums = sums + sum_x_x2; \ + } \ + sums_f32 = convert_float2(sums); \ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \ + } \ + lcl_sum[lidx] = sums_f32.x; \ + lcl_sqr[lidx] = sums_f32.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum = 0, sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16) +INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16) + +#define INSTANCE_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + \ + int2 coord = (int2)(gidx, gidy); \ + src_type src0; \ + float2 sums_f32 = 0; \ + int2 sums = 0, sum_x_x2; \ + int endH = gidy + height; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + if (gidx < width) \ + { \ + for(; coord.y < endH;) \ + { \ + VXC_ReadImage(src0, input, coord, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(sum_x_x2, src0, 
src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums = sums + sum_x_x2; \ + } \ + sums_f32 = convert_float2(sums); \ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \ + } \ + lcl_sum[lidx] = sums_f32.x; \ + lcl_sqr[lidx] = sums_f32.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum = 0, sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16) +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16) + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +#define INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_para = (int2)(0, gidz); \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para); \ + bias_f = read_imagef(bias, coord_para); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16) +INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16) + +#define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int2 coord = (int2)(get_global_id(0), gidy); \ + int2 coord_para = (int2)(0, gidz); \ + int endH = gidy + height; \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para); \ + bias_f = read_imagef(bias, coord_para); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + for(; coord.y < endH; coord.y++) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16) +INSTANCE_NORM_8BITS_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16) \ No newline at end of file diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx new file mode 100644 index 0000000..82d1704 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx @@ -0,0 +1,154 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float inv_multiplier; +_viv_uniform int group_num; +_viv_uniform float input_scale; +_viv_uniform float input_zp; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; + +#define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + coord_para = coord; \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_para.z, baseAddr); \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_para.xy = coord.xy; \ + coord.y++; \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord_para.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha 
* tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16) +INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16) + +#define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + int endH = gidy + height; \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + for(; coord.y < endH;) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_para = coord; \ + coord.y++; \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_para.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8_TO_F16_IMPL_2D(U8_F32toF16, vxc_uchar16) +INSTANCE_NORM_8_TO_F16_IMPL_2D(I8_F32toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx new file mode 100644 index 0000000..75221f4 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx @@ -0,0 +1,285 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float inv_multiplier; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, gidz); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr; \ + float4 tmpSumSqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + if(gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniSum_X_X2_8x2); \ + tmpSumSqr += sumsqr; \ + } \ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = tmpSumSqr.x; \ + lcl_sqr[lidx] = tmpSumSqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_16BITS_IMPL(F16, vxc_half8) +INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) + +#define INSTANCE_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + \ + int2 coord = (int2)(gidx, gidy); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr; \ + float4 tmpSumSqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + int endH = gidy + height; \ + if(gidx < width) \ + { \ + for(; coord.y < endH;) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 
0, VXC_RM_TowardZero, 0), \ + uniSum_X_X2_8x2); \ + tmpSumSqr += sumsqr; \ + } \ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = tmpSumSqr.x; \ + lcl_sqr[lidx] = tmpSumSqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) + +#define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float alpha = input_scale * output_scale * scale_vari; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src0, 16); \ + \ + coord_in.y ++; \ + \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, 
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_16BITS_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +INSTANCE_NORM_16BITS_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +INSTANCE_NORM_16BITS_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + int endH = gidy + height; \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float alpha = input_scale * output_scale * scale_vari; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + for(; coord.y < endH; coord.y++) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src0, 16); \ + \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx similarity index 87% rename from src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx 
rename to src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx index bba8627..19f335b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx @@ -2,16 +2,13 @@ _viv_uniform int width; _viv_uniform int height; -_viv_uniform float dimRatio; +_viv_uniform float inv_multiplier; _viv_uniform int group_num; _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; _viv_uniform VXC_512Bits uniExtractOddData_2x8; -constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); -constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0); - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16( +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16( image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; @@ -20,8 +17,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, gidz); vxc_short8 src0, src1, src2; float4 srcA, srcB; - vxc_float sum = 0, sqr = 0; - + float sum = 0, sqr = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -71,7 +69,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D( +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D( image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; @@ -82,7 +80,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy); vxc_short8 src0, src1, src2; float4 srcA, srcB; - vxc_float sum = 0, sqr = 0; + float sum = 0, sqr = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -129,7 +129,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16( +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) { @@ -138,30 +138,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); vxc_short8 src0, src1, src2; float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); + float4 mean_vari = (float4)(0); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); __global float4* vari_ptr = (__global float4*)sumVari_ptr; - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; + float sval = read_imagef(scale, coord.yz).x; + float bval = read_imagef(bias, coord.yz).x; for(int i = 0; i < group_num; i++) { mean_vari += vari_ptr[i]; } - mean_vari *= 
dimRatio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; + float4 tmpData0, tmpData1; bias_val = (bval - scale_vari * mean_vari.s0); int8 input_desc, output_desc; @@ -185,7 +181,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16); _viv_asm(COPY, tmpData1, src2, 16); - vxc_float4 norm; + float4 norm; norm = scale_vari * tmpData0 + bias_val; _viv_asm(COPY, src0, norm, 16); norm = scale_vari * tmpData1 + bias_val; @@ -207,30 +203,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int endH = gidy + height; vxc_short8 src0, src1, src2; float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); + float4 mean_vari = (float4)(0); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); __global float4* vari_ptr = (__global float4*)sumVari_ptr; - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; + float sval = read_imagef(scale, coord_para.yx).x; + float bval = read_imagef(bias, coord_para.yx).x; for(int i = 0; i < group_num; i++) { mean_vari += vari_ptr[i]; } - mean_vari *= dimRatio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; + float4 tmpData0, tmpData1; bias_val = (bval - scale_vari * mean_vari.s0); for(; coord.y < endH; coord.y++) @@ -244,7 +236,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16); _viv_asm(COPY, tmpData1, src2, 16); - vxc_float4 norm; + float4 norm; norm = scale_vari * tmpData0 + bias_val; _viv_asm(COPY, src0, norm, 16); norm = scale_vari * tmpData1 + bias_val; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx deleted file mode 100644 index 2fd2d44..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx +++ /dev/null @@ -1,259 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - 
for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 
* mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - coord_in.y ++; - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx deleted file mode 100644 index fa5538c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx +++ /dev/null @@ -1,416 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform float inFlScale_s2; -_viv_uniform float input_fl_scale; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, 0, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - //tmpSumSqr += sumsqr; - tmpSumSqr.x += sumsqr.x; - sqr += (sumsqr.y * inFlScale_s2); - } - sum = tmpSumSqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - //tmpSumSqr += sumsqr; - tmpSumSqr.x += sumsqr.x; - sqr += (sumsqr.y * inFlScale_s2); 
- } - sum = tmpSumSqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); 
- int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord.xy, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - for(coord.y = 0; coord.y < height; coord.y++) - { - 
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx deleted file mode 100644 index a6c98ef..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx +++ /dev/null @@ -1,397 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniSumInt8_16x1; -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1; -_viv_uniform float inFlScale_s2; -_viv_uniform 
float input_fl_scale; - -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; - -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - tmpSqr += (tmpSqr1); - } - sqr = tmpSqr * inFlScale_s2; - sum = tmpSum * input_fl_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - tmpSqr += (tmpSqr1); - } - sqr = tmpSqr * inFlScale_s2; - sum = tmpSum * input_fl_scale; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - coord_para = coord; - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_para.z, baseAddr); - - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para.xy = coord.xy; - coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - 
vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para = coord; - coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - 
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, 
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx deleted file mode 100644 index b81a1a1..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx +++ /dev/null @@ -1,289 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform int inputZP; -_viv_uniform float scale_inOut; -_viv_uniform float outputScale; -_viv_uniform int output_ZP; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - -#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ - image2d_array_t output, float eps, int rsFlg) \ -{ \ - int gidz = get_global_id(1); \ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ - int2 coord_para = (int2)(gidz, 0); \ - read_type src0, src2; \ - float scale_vari, bias_val; \ - vxc_float4 mean_vari = (vxc_float4)(0); \ - \ - Image img1 = create_image_from_image2d(bias, 4); \ - Image img2 = create_image_from_image2d(scale, 4); \ - Image img3 = create_image_from_image2d(meanVari, 4); \ - __global float* bias_ptr = (__global float*)img1.ptr; \ - __global float* scal_ptr = (__global float*)img2.ptr; \ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ - \ - float bval = bias_ptr[gidz]; \ - float sval = scal_ptr[gidz]; \ - \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += vari_ptr[i]; \ - } \ - mean_vari *= dimRatio; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = sval * mean_vari.s1; \ - short zp = inputZP; \ - vxc_int4 tmpVal0, tmpVal1; \ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = scale_inOut * scale_vari; \ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ - \ - int8 input_desc, output_desc; \ - _viv_asm(COPY, input_desc, input, 
sizeof(input_desc)); \ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ - _viv_asm(MOV, coord_in.z, baseAddr_a); \ - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ - _viv_asm(MOV, coord.z, baseAddr); \ - \ - for(coord.y = 0; coord.y < height; coord.y++) \ - { \ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - coord_in.y ++; \ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert1stUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert2ndUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert3rdUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert4thUint8SubZpToFp32_4x4); \ - norm = tmpData0 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - norm = tmpData2 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ - } \ -} -INSTANCENORM_8BITS_F32(U8, vxc_uchar16) -INSTANCENORM_8BITS_F32(I8, vxc_char16) - -#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ - image2d_array_t output, float eps, int rsFlg) \ -{ \ - int gidz = get_global_id(1); \ - int gidy = gidz * height; \ - int2 coord = (int2)(get_global_id(0), gidy); \ - int2 coord_para = (int2)(gidz, 0); \ - int endH = gidy + height; \ - read_type src0, src2; \ - float scale_vari, bias_val; \ - vxc_float4 mean_vari = (vxc_float4)(0); \ - \ - Image img1 = create_image_from_image2d(bias, 4); \ - Image img2 = create_image_from_image2d(scale, 4); \ - Image img3 = create_image_from_image2d(meanVari, 4); \ - __global float* bias_ptr = (__global float*)img1.ptr; \ - __global float* scal_ptr = (__global float*)img2.ptr; \ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ - \ - float bval = bias_ptr[gidz]; \ - float sval = scal_ptr[gidz]; \ - \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += vari_ptr[i]; \ - } \ - \ - mean_vari *= dimRatio; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = sval * mean_vari.s1; \ - short zp = inputZP; \ - vxc_int4 tmpVal0, tmpVal1; \ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = scale_inOut * scale_vari; \ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ - \ - for(; coord.y < endH; coord.y++) \ - { \ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - 
uniConvert1stUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert2ndUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert3rdUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert4thUint8SubZpToFp32_4x4); \ - norm = tmpData0 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - norm = tmpData2 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - } \ -} -INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16) -INSTANCENORM_8BITS_F32_2D(I8, vxc_char16) - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_para = (int2)(gidz, 0); - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(gidz, 0); - int endH = gidy + height; - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx deleted file mode 100644 index d51e38e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx +++ /dev/null @@ -1,146 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = 
create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bval - scale_vari * mean_vari.s0); - vxc_half8 dst; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - coord_in.y ++; - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(gidz, 0); - int endH = gidy + height; - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bval - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx deleted file mode 100644 index 5c0f235..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx +++ /dev/null @@ -1,254 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform float e2InScale; -_viv_uniform float rowSumScale; -_viv_uniform float scale_inOut; -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, 0, \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, 
data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - int endH = gidy + height; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, 0, - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); 
- } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx deleted file mode 100644 index b737ffe..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx +++ /dev/null @@ -1,147 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform float input_scale; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - coord_para = coord; - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_para.z, baseAddr); - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para.xy = coord.xy; - coord.y++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, 
outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para = coord; - coord.y++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx index bd3a733..95d9c87 100644 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -94,6 +94,7 @@ _viv_uniform float zpSqrt16x; _viv_uniform VXC_512Bits uniSumAll_16x1; _viv_uniform int inputZP; + #define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ vxc_float4 rsqrt0;\ Image dst_img = create_image_from_image2d(output, 1); \ @@ -143,31 +144,31 @@ _viv_uniform int inputZP; dst_ptr[0] = dst.s0; \ break; \ case 2: \ - VXC_Vstore2(dst_ptr, 0, dst); \ + VXC_Vstore2(dst_ptr, 0, dst.s01); \ break; \ case 3: \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 4: \ - VXC_Vstore4(dst_ptr, 0, dst); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ break; \ case 5: \ - VXC_Vstore2(dst_ptr, 0, dst); \ + VXC_Vstore2(dst_ptr, 0, dst.s01); \ dst.s012 = dst.s234; \ dst_ptr += 2; \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 6: \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ dst.s012 = dst.s345; \ dst_ptr += 3; \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 7: \ - VXC_Vstore4(dst_ptr, 0, dst); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ dst.s012 = dst.s456; \ dst_ptr += 4; \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ default: \ VXC_Vstore8(dst_ptr, 0, dst); \ @@ -177,16 +178,13 @@ _viv_uniform int inputZP; } \ -#define L2NORMSCALE_AXIS0_2D(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \ +#define L2NORMSCALE_AXIS0(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \ dst_type, convert_type, output_type, copy_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ - void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \ (\ - __read_only image2d_t input,\ - __read_only image2d_t scale,\ - __write_only image2d_t output,\ - int axis\ - )\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output,\ + int axis )\ { \ int lidx = get_local_id(0); \ int offset = get_global_id(0); \ @@ -236,19 +234,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ } -L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \ +L2NORMSCALE_AXIS0(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \ ushort, half4, vxc_half8, vxc_ushort8) -#define L2NORMSCALE_AXIS0_QNT_2D(in0_name, in1_name, out_name,\ +#define L2NORMSCALE_AXIS0_QNT(in0_name, in1_name, out_name,\ src_type, src_scalar_type, dst_type, convert_type, output_type, copy_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ -void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ +void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \ (\ - __read_only image2d_t input,\ - __read_only image2d_t scale,\ - __write_only image2d_t output,\ - int axis\ - )\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\ { \ int lidx = get_local_id(0); \ int offset = get_global_id(0); \ @@ -302,9 +296,9 @@ void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ } -L2NORMSCALE_AXIS0_QNT_2D(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8)
-L2NORMSCALE_AXIS0_QNT_2D(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8) -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8) -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8) -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8) -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8) +L2NORMSCALE_AXIS0_QNT(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_QNT(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8) +L2NORMSCALE_AXIS0_QNT(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_QNT(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8) +L2NORMSCALE_AXIS0_QNT(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_QNT(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0_2d.vx new file mode 100644 index 0000000..f214d53 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0_2d.vx @@ -0,0 +1,207 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int inputWidth; +_viv_uniform float output_ZP; +_viv_uniform float zP2x; +_viv_uniform int inputZP; + +_viv_uniform float inOutScale; +_viv_uniform float e2InScale; +_viv_uniform float zpSqr8x; +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) + void l2normalizescale_axis0_F16_F16toF16_2D( + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int lidx = get_local_id(0); + vxc_short8 src0, src1, dst; + vxc_half8 in_h, scale_h, tmpDst; + float sum = 0; + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1; + __local float lcl_sum[16]; + float4 one = (float4)(1, 1, 1, 1); + for(; coord.x < inputWidth;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 128; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniFp16SumSqr_dp8x2); + sum += sumsqr.y; + } + lcl_sum[lidx] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + float alpha = rsqrt(sum); + + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + 
UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + half4 tmpVal0, tmpVal1; + tmpData0 *= scale_f0 * alpha; + tmpData1 *= scale_f1 * alpha; + _viv_asm(CONV, tmpVal0, tmpData0); + _viv_asm(CONV, tmpVal1, tmpData1); + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, dst, tmpDst, 16); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + + +#define L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(in0_name, in1_name, out_name, read_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + int lidx = get_local_id(0); \ + read_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0; \ + vxc_float4 scale_f0, scale_f1, sumsqr; \ + __local float lcl_sum[16]; \ + float4 one = (float4)(1, 1, 1, 1); \ + for(; coord.x < inputWidth;) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 128; \ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniInt16SumSqr_dp8x2); \ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \ + } \ + sum *= e2InScale; \ + lcl_sum[lidx] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + sum = dot(data0, one); \ + float alpha = rsqrt(sum) * inOutScale; \ + short zp = inputZP; \ + vxc_float4 tmpData0, tmpData1; \ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); \ + \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); \ + \ + int4 tmpVal0 = convert_int4_rte(tmpData0 * scale_f0 * alpha + output_ZP); \ + int4 tmpVal1 = convert_int4_rte(tmpData1 * scale_f1 * alpha + output_ZP); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(U8, F16, U8, vxc_uchar8) +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I8, F16, I8, vxc_char8) +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I16, F16, I16, vxc_short8) + +#define L2NORMSCALE_QINTF16TOF16_AXIS0_2D(in0_name, in1_name, out_name, read_type) \ +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) \ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + int lidx = get_local_id(0); \ + read_type src0; \ + vxc_short8 src1, dst; \ + vxc_half8 scale_h, tmpDst; \ + float sum = 0; \ + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1; \ + __local float lcl_sum[16]; \ + float4 one = (float4)(1, 1, 1, 1); \ + for(; coord.x < inputWidth;) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 128; \ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniInt16SumSqr_dp8x2); \ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \ + } \ + sum *= e2InScale; \ + lcl_sum[lidx] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + sum = dot(data0, one); \ + float alpha = rsqrt(sum) * inOutScale; \ + short zp = inputZP; \ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); \ + \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); \ + \ + half4 tmpVal0, tmpVal1; \ + tmpData0 *= scale_f0 * alpha; \ + tmpData1 *= scale_f1 * alpha; \ + _viv_asm(CONV, tmpVal0, tmpData0); \ + _viv_asm(CONV, tmpVal1, tmpData1); \ + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); \ + _viv_asm(COPY, dst, tmpDst, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(U8, F16, F16, vxc_uchar8) +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I8, F16, F16, vxc_char8) +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I16, F16, F16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx deleted file mode 100644 index c0a6e19..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx +++ /dev/null @@ -1,279 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, 
input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -__kernel void layer_norm_U8toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + 
input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - coord_bias.x += 4; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - coord_out.x = coord.x; - 
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} -/***************************layernorm float16 to uint8**************************/ -__kernel void layer_norm_F16toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - norm = norm * outputScale + output_zp; - int4 output_int4; - output_int4 = convert_int4_rte(norm); - vxc_uchar8 dst; - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), - uniConvertInt32toUint8_2x8); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_0.vx new file mode 100644 index 0000000..5674bc8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_0.vx @@ -0,0 +1,390 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int width; +_viv_uniform float inv_multiplier; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define CONV2F32(dst, src, section) \ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), \ + uniDataToFP32_##section##_4x4); + +#define LAYER_NORM_8BITS_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + coord_out.x = coord.x; \ + 
VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16) +LAYER_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16) + +#define LAYER_NORM_SUMS_2D() \ + uint2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); + +#define LAYER_NORM_8BITS_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + \ + LAYER_NORM_SUMS_2D(); \ + \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for (coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16) +LAYER_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16) + +#define LAYER_NORM_8TOF16_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + 
__read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, 
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8TOF16_IMPL(U8_F16toF16, vxc_uchar16) +LAYER_NORM_8TOF16_IMPL(I8_F16toF16, vxc_char16) + +#define LAYER_NORM_8TOF16_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + \ + LAYER_NORM_SUMS_2D(); \ + \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + for (coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x -= 8; \ + } \ +} +LAYER_NORM_8TOF16_IMPL_2D(U8_F16toF16, vxc_uchar16) +LAYER_NORM_8TOF16_IMPL_2D(I8_F16toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_1.vx new file mode 100644 index 0000000..d2be567 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_1.vx @@ -0,0 +1,343 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int width; +_viv_uniform float inv_multiplier; +_viv_uniform float output_scale; +_viv_uniform 
float output_zp; + +#define CONV2F32(dst, src, section) \ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataToFP32_##section##_4x4); + +#define LAYER_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + vxc_short8 src1; \ + dst_type result; \ + vxc_half8 scale_h; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for(coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define LAYER_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, 
conv_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + dst_type result; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for (coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + coord_bias.x += 4; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define LAYER_NORM_16_32_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + dst_type result; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, 
baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for(coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + bias_ptr += 8; \ + scale_ptr += 8; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16_32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16_32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16_32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16_32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define LAYER_NORM_16_32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + dst_type result; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + \ + Image img1 = 
create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for (coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + bias_ptr += 8; \ + scale_ptr += 8; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16_32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16_32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16_32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16_32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2.vx new file mode 100644 index 0000000..a45a0ab --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2.vx @@ -0,0 +1,385 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int width; +_viv_uniform float inv_multiplier; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define CONV2F32(dst, src, section) \ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataToFP32_##section##_4x4); + +#define LAYER_NORM_8_32_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = 
(int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8_32_IMPL(U8_F32toU8, vxc_uchar16) +LAYER_NORM_8_32_IMPL(I8_F32toI8, vxc_char16) + +#define LAYER_NORM_8_32_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums 
= convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for (coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8_32_IMPL_2D(U8_F32toU8, vxc_uchar16) +LAYER_NORM_8_32_IMPL_2D(I8_F32toI8, vxc_char16) + +#define LAYER_NORM_8_32TOF16_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y 
= rsqrt(sums.y); \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8_32TOF16_IMPL(U8_F32toF16, vxc_uchar16) +LAYER_NORM_8_32TOF16_IMPL(I8_F32toF16, vxc_char16) + +#define LAYER_NORM_8_32TOF16_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for 
(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x -= 8; \ + } \ +} +LAYER_NORM_8_32TOF16_IMPL_2D(U8_F32toF16, vxc_uchar16) +LAYER_NORM_8_32TOF16_IMPL_2D(I8_F32toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx deleted file mode 100644 index d517d7d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx +++ /dev/null @@ -1,234 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = 
read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -__kernel void layer_norm_U8toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - 
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean; - tmpData1 = tmpData1 * input_scale - mean; - tmpData2 = tmpData2 * input_scale - mean; - tmpData3 = tmpData3 * input_scale - mean; - - vxc_float4 norm; - norm = scale_f0 * vari * tmpData0 + bias_f0; - bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - coord_bias.x += 4; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f1 * vari * tmpData1 + bias_f1; - bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - norm = scale_f0 * vari * tmpData2 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f1 * vari * tmpData3 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - } -} -/***************************layernorm float16 to uint8**************************/ -__kernel void layer_norm_F16toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - norm = norm * outputScale + output_zp; - int4 output_int4; - output_int4 
= convert_int4_rte(norm); - vxc_uchar8 dst; - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx similarity index 100% rename from src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx deleted file mode 100644 index e461f28..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx +++ /dev/null @@ -1,168 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform float dimRatio_scale; -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float e2InScale; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform float input_scale; -_viv_uniform int inputZP; - -__kernel void layer_norm_I16toI16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_short8 src0, src1, dst; - vxc_float sum = 0, sqr = 0; - for(; coord_in.x < width;) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * e2InScale; - } - vxc_float mean; - mean = sum * dimRatio_scale; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_half8 scale_h; - vxc_int4 tmpVal0, tmpVal1; - - int2 coord_bias = (int2)(0, 0); - - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord_in.x; - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - 
uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_I16toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int2 coord = (int2)(0, get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_float sum = 0, sqr = 0; - for(; coord.x < width;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * e2InScale; - } - vxc_float mean, vari; - mean = sum * dimRatio_scale; - vari = sqr * dimRatio - mean * mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_half8 scale_h; - vxc_int4 tmpVal0, tmpVal1; - - int2 coord_bias = (int2)(0, 0); - - for(coord.x = 0; coord.x < width; coord.x += 8) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx deleted file mode 100644 index 221e93e..0000000 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx +++ /dev/null @@ -1,276 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - vxc_short8 src0; - vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f, scale_f, in_f; - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0)); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0)); - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = vload4(0, bias_ptr + coord.x); - scale_f = vload4(0, scale_ptr + coord.x); - vxc_half8 in_h; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; 
-_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform float dimRatio_scale; - -__kernel void layer_norm_U8F32toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - vxc_uchar16 src0, src2; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0)); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0)); - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - bias_f2 = vload4(2, bias_ptr); - bias_f3 = vload4(3, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - scale_f2 = vload4(2, scale_ptr); - scale_f3 = vload4(3, scale_ptr); - bias_ptr += 16; - scale_ptr += 16; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - tmpData2 -= mean; - norm = scale_f2 * vari * tmpData2 + 
bias_f2; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData3 -= mean; - norm = scale_f3 * vari * tmpData3 + bias_f3; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_I16F32toI16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_short8 src0, dst; - vxc_float sum = 0, sqr = 0; - for(; coord_in.x < width;) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * e2InScale; - } - vxc_float mean; - mean = sum * dimRatio_scale; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_int4 tmpVal0, tmpVal1; - - int2 coord_bias = (int2)(0, 0); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias); - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - bias_ptr += 8; - scale_ptr += 8; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx deleted file mode 100644 index 8010726..0000000 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx +++ /dev/null @@ -1,237 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16F32toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f, scale_f, in_f; - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = vload4(0, bias_ptr + coord.x); - scale_f = vload4(0, scale_ptr + coord.x); - - vxc_half8 in_h; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform float dimRatio_scale; - -__kernel void layer_norm_U8F32toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0, src2; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; - int tmpSum 
= 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - bias_f2 = vload4(2, bias_ptr); - bias_f3 = vload4(3, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - scale_f2 = vload4(2, scale_ptr); - scale_f3 = vload4(3, scale_ptr); - bias_ptr += 16; - scale_ptr += 16; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean; - tmpData1 = tmpData1 * input_scale - mean; - tmpData2 = tmpData2 * input_scale - mean; - tmpData3 = tmpData3 * input_scale - mean; - - vxc_float4 norm; - norm = scale_f0 * vari * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f1 * vari * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - norm = scale_f2 * vari * tmpData2 + bias_f2; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f3 * vari * tmpData3 + bias_f3; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_I16F32toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - - vxc_short8 src0, src1, dst; - vxc_float sum = 0, sqr = 0; - for(; coord.x < width;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * 
e2InScale; - } - vxc_float mean, vari; - mean = sum * dimRatio_scale; - vari = sqr * dimRatio - mean * mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_half8 scale_h; - vxc_int4 tmpVal0, tmpVal1; - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); - for(coord.x = 0; coord.x < width; coord.x += 8) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - bias_ptr += 8; - scale_ptr += 8; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx deleted file mode 100644 index a76cb4f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx +++ /dev/null @@ -1,239 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/*****************************layernorm uint8 to fp16****************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits UniPackFP16even_2x8; - -__kernel void layer_norm_U8toF16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_array_t output, - float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, 
src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_short8 src1, outval; - short zp = inputZP; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - _viv_asm(CONV, tmpVal0, norm); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x += 8; - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_U8toF16_2D( - image2d_t input, - image2d_t bias, - image2d_t scale, - image2d_t output, - float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - 
tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_short8 src1, outval; - short zp = inputZP; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - coord_bias.x = coord.x; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - - coord_bias.x += 4; - _viv_asm(CONV, tmpVal0, norm); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x = coord.x; - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x += 8; - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx deleted file mode 100644 index d494b6d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx +++ /dev/null @@ -1,430 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform int width; - -_viv_uniform int height; - -_viv_uniform int height_depth; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits 
uniConvertInt32toUint8_2x8; -_viv_uniform float outputScale; -_viv_uniform float output_zp; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; 
- vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - 
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - 
uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx deleted file mode 100644 index 7c92a66..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx +++ /dev/null @@ -1,268 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform float e2InScale; 
-_viv_uniform int width; - -_viv_uniform float input_scale; -_viv_uniform int height; - -_viv_uniform int height_depth; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform float outputScale; -_viv_uniform float output_zp; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - float4 tmpSumSqr = (float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - tmpSumSqr.x *= input_scale; - tmpSumSqr.y *= e2InScale; - } - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - float4 data = (float4)(0); - for(int i = 0; i < 4; i++) - { - data.x += dot(tmp_sum[i], one); - data.y += dot(tmp_sqr[i], one); - } - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D( - image2d_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - float4 tmpSumSqr = (float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - tmpSumSqr.x *= input_scale; - tmpSumSqr.y *= e2InScale; - } - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - float4 data = (float4)(0); - for(int i = 0; i < 4; i++) - { - data.x += dot(tmp_sum[i], one); - data.y += dot(tmp_sqr[i], one); - } - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16( - image2d_array_t input, image2d_array_t bias, 
image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_short8 src0, src1, outval; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_short8 src0, src1, outval; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - 
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx deleted file mode 100644 index 4c9e46b..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx +++ /dev/null @@ -1,423 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform float e2InScale; -_viv_uniform float rowSumScale; -_viv_uniform int width; - -_viv_uniform float input_scale; -_viv_uniform int height; - -_viv_uniform int height_depth; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform float outputScale; -_viv_uniform float output_zp; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - 
VXC_OP4(img_load_3d, src0, input, coord.xywz, 0, - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D( - image2d_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, 0, - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + 
input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - half4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - half4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - 
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_uchar16 src0 , outval; - vxc_short8 src1; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_uchar16 src0, outval; - vxc_short8 src1; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx index 433dc4f..a25eb64 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx @@ -76,6 +76,7 @@ __kernel void gemm_BF16BF16toBF16(image2d_array_t inputA, sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -153,6 +154,7 @@ __kernel void gemm_transa_BF16BF16toBF16( sum3 = (sum3 + tempA0.w * tempB0); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -187,7 +189,7 @@ __kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 
coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx index bd211d4..bdedc7a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx @@ -82,6 +82,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, sum3 += (tempA3); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -170,6 +171,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, sum3 += (tempA3 + tempB3); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -244,7 +246,7 @@ __kernel void gemm_F32F32toF32( sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); } - coord_b = (int4)(gidx, gidy, get_global_id(2), 0); + coord_b = (int4)(gidx, gidy, get_global_id(2), get_global_id(2)); write_imagef(output, coord_b, sum0); coord_b.y++; write_imagef(output, coord_b, sum1); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx index 1929119..0e9bcd5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx @@ -82,6 +82,7 @@ __kernel void gemm_F16F16to##dst_type_name( \ vxc_int4 tmpOut0, tmpOut1; \ write_type outC; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -171,6 +172,7 @@ __kernel void gemm_F16F16to##dst_type_name( \ vxc_int4 tmpOut0, tmpOut1; \ write_type outC; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx index 7cdf087..520f70d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx @@ -79,6 +79,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -167,6 +168,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx index 515d2fb..322f474 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx @@ -75,6 +75,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ sum2 *= input1Scale; \ sum3 *= input1Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -171,6 +172,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ } \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx index 39ddada..b9e803e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx @@ -70,6 +70,7 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx index 7792e92..b4db308 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx @@ -79,6 +79,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx index 2fb3d26..d55fa59 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx @@ -65,6 +65,7 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -150,6 +151,7 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -232,6 +234,7 @@ 
__kernel void gemm_transa_F16F16toF16( sum3 = (sum3 + tempA0.w * tempB0); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -257,4 +260,4 @@ __kernel void gemm_transa_F16F16toF16( _viv_asm(COPY, outC, valC, 16); VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx index 8548fe7..2e7aab1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx @@ -15,7 +15,7 @@ __kernel void gemm_transb_F16F16toF16(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx index 1c6ad3d..e7be4f4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx @@ -148,7 +148,7 @@ __kernel void gemm_transb_F16U8toU8(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx index 71bd242..50e992f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx @@ -19,7 +19,7 @@ __kernel void gemm_transb_U8U8toF16(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); @@ -154,7 +154,7 @@ __kernel void gemm_transb_U8U8toU8(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx index 1b1e92f..64fe053 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx @@ -74,6 +74,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = get_global_id(1); \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx index 021ff4b..f8aa096 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx @@ -85,6 +85,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \ sum2 *= input0Scale; \ sum3 *= input0Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -190,6 +191,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \ sum2 *= input0Scale; \ sum3 *= input0Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx index 6cdf89e..7617e4e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx @@ -79,6 +79,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -177,6 +178,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx index 3816d56..ce3838e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx @@ -10,6 +10,7 @@ _viv_uniform int bc2zero; _viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; _viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; _viv_uniform float input01Scale; +_viv_uniform float mulKIn0In1Zp; #define GEMM_QINT_TO_F16(src0_type_name, read_type) \ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ @@ -23,10 +24,8 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ \ int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \ - vxc_float4 sum0 = (vxc_float4)(0); \ - vxc_float4 sum1 = (vxc_float4)(0); \ - vxc_float4 sum2 = (vxc_float4)(0); \ - vxc_float4 sum3 = (vxc_float4)(0); \ + vxc_float4 sum0 = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \ + vxc_float4 sum1 = sum0, sum2 = sum0, sum3 = sum0; \ \ int8 inputA_desc, inputB_desc, output_desc; \ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ @@ -84,6 +83,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ sum2 *= input01Scale; \ sum3 *= input01Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -185,6 +185,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ } \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/mod.vx b/src/tim/vx/internal/src/libnnext/ops/vx/mod.vx new file mode 100644 index 0000000..163c840 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/mod.vx @@ -0,0 +1,185 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4; + +_viv_uniform float in_scale0; +_viv_uniform float in_scale1; +_viv_uniform float out_scale; +_viv_uniform float in0Tail; +_viv_uniform float in1Tail; +_viv_uniform float out_zp; + +#define MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + save_type data; \ + read_type read_data0, read_data1; \ + copy_type tmpData0, tmpData1; \ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \ + vxc_float4 tmpVal1, tmpVal2; \ + dst_type tmpOut1, tmpOut2; \ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpData0, read_data0, 16); \ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpData1, read_data1, 16); \ + VXC_DP4x4(in0Val1, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(in0Val2, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + VXC_DP4x4(in1Val1, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(in1Val2, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + in0Val1 = in0Val1 * IN0_SCALE + IN0_TAIL; \ + in0Val2 = in0Val2 * IN0_SCALE + IN0_TAIL; \ + in1Val1 = in1Val1 * IN1_SCALE + IN1_TAIL; \ + in1Val2 = in1Val2 * IN1_SCALE + IN1_TAIL; \ + if (isfmod) \ + { \ + tmpVal1 = fmod(in0Val1, in1Val1) * OUT_SCALE + OUT_OFFSET; \ + tmpVal2 = fmod(in0Val2, in1Val2) * OUT_SCALE + OUT_OFFSET; \ + } \ + else \ + { \ + tmpVal1 = (in0Val1 - in1Val1 * floor(in0Val1 / in1Val1)) * OUT_SCALE + OUT_OFFSET; \ + tmpVal2 = (in0Val2 - in1Val2 * floor(in0Val2 / in1Val2)) * OUT_SCALE + OUT_OFFSET; \ + } \ + _viv_asm(conv_mode, tmpOut1, tmpVal1); \ + 
_viv_asm(conv_mode, tmpOut2, tmpVal2); \ + VXC_DP2x8(data, tmpOut1, tmpOut2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + +#define TENSOR_MOD(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \ +__kernel void mod_##src0_name##src1_name##to##dst_name \ + ( \ + image2d_array_t input0, \ + image2d_array_t input1, \ + image2d_array_t output, \ + int isfmod \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + + +TENSOR_MOD(F16, F16, F16, half4, vxc_short8, vxc_short8,\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0) +TENSOR_MOD(F16, F16, I16, short4, vxc_short8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD(F16, F16, I8, char4, vxc_char8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) + +TENSOR_MOD(I16, I16, I16, short4, vxc_short8, vxc_short8,\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD(I16, I16, F16, half4, vxc_short8, vxc_short8,\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD(I8, I8, I8, char4, vxc_char8, vxc_char16,\ + vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD(I8, I8, F16, half4, vxc_short8, vxc_char16,\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + + +#define TENSOR_MOD_2D(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \ +__kernel void mod_##src0_name##src1_name##to##dst_name##_2D \ + ( \ + image2d_array_t input0, \ + image2d_array_t input1, \ + image2d_array_t output, \ + int isfmod \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \ +} + + +TENSOR_MOD_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0) +TENSOR_MOD_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) + +TENSOR_MOD_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\ + vxc_char16, CONV_SAT_RTE, 
in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD_2D(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define MOD_BF16_PROCESS(read_fun, write_fun) \ + vxc_short8 read_data0, read_data1, vec0; \ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \ + vxc_float4 tmpVal1, tmpVal2; \ + vxc_ushort8 dst0, dst1; \ + vxc_ushort8 vect; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, in0Val1, vec0, 16); \ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, in0Val2, vec0, 16); \ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, in1Val1, vec0, 16); \ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, in1Val2, vec0, 16); \ + tmpVal1 = fmod(in0Val1, in1Val1); \ + tmpVal2 = fmod(in0Val2, in1Val2); \ + _viv_asm(COPY, dst0, tmpVal1, 16); \ + _viv_asm(COPY, dst1, tmpVal2, 16); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void mod_BF16BF16toBF16 + ( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output, + int isfmod + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + MOD_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray); +} + +__kernel void mod_BF16BF16toBF16_2D + ( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output, + int isfmod + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + MOD_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx new file mode 100644 index 0000000..19873f1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx @@ -0,0 +1,247 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float input0_scale; +_viv_uniform float input1_scale; +_viv_uniform float input0_tail; +_viv_uniform float input1_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define POW_SH_IMPL(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \ +__kernel void pow_##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 
coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + src0_type src0; \ + copy0_type data0; \ + src0_type src1; \ + copy0_type data1; \ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data0, src0, 16); \ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data1, src1, 16); \ + float4 x0, x1; \ + float4 y0, y1; \ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + x0 = x0 * input0_scale + input0_tail; \ + x1 = x1 * input0_scale + input0_tail; \ + y0 = y0 * input1_scale + input1_tail; \ + y1 = y1 * input1_scale + input1_tail; \ + float4 s0 = sign(x0); \ + float4 s1 = sign(x1); \ + int4 t0 = convert_int4(y0) & 1; \ + int4 t1 = convert_int4(y1) & 1; \ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; \ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \ + x0 = x0 * output_scale + output_zp; \ + x1 = x1 * output_scale + output_zp; \ + \ + conv_type tmpVal0, tmpVal1; \ + _viv_asm(CONV_RTE, tmpVal0, x0); \ + _viv_asm(CONV_RTE, tmpVal1, x1); \ + dst_type dst0; \ + \ + copy2_type dst; \ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +POW_SH_IMPL(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) +POW_SH_IMPL(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) 
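For reference, the POW_SH_IMPL instantiations above and below all expand to the same per-lane arithmetic: dequantize each input with an affine map (q * scale + tail), evaluate a sign-corrected power via exp2/log2, then requantize with the output scale and zero point. A minimal scalar C sketch of that math is given here; the helper names and the scale/tail/zp values are illustrative examples only, not the driver's actual quantization parameters.

/* Scalar C model of the math the quantized pow_* kernels implement per lane.
 * Names are illustrative; real scales and zero points come from tensor
 * quantization parameters supplied by the host side. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Dequantize: the kernels fold (q - zero_point) * scale into q * scale + tail. */
static float dequant(int32_t q, float scale, float tail)
{
    return (float)q * scale + tail;
}

/* Sign-corrected pow: x^y = s * exp2(y * log2(|x|)), where s is -1 only when
 * x < 0 and trunc(y) is odd -- the same selection the .vx code makes with
 * t = convert_int4(y) & 1 and the nested ternaries on s0/s1. */
static float pow_signed(float x, float y)
{
    float s = (x < 0.0f) ? (((int)y & 1) ? -1.0f : 1.0f) : 1.0f;
    return s * exp2f(y * log2f(fabsf(x)));
}

/* Requantize with round-to-nearest, as convert_int4_rte / CONV_SAT_RTE do
 * (saturation to the destination type is omitted in this sketch). */
static int32_t requant(float v, float out_scale, float out_zp)
{
    return (int32_t)lrintf(v * out_scale + out_zp);
}

int main(void)
{
    /* Example quantization parameters only. */
    float in0_scale = 0.05f, in0_tail = -1.0f;
    float in1_scale = 0.10f, in1_tail = 0.0f;
    float out_scale = 4.0f,  out_zp   = 128.0f;

    int32_t q0 = 30, q1 = 20;                    /* raw quantized inputs */
    float x = dequant(q0, in0_scale, in0_tail);  /* 0.5 */
    float y = dequant(q1, in1_scale, in1_tail);  /* 2.0 */
    float r = pow_signed(x, y);                  /* 0.25 */
    printf("pow(%g, %g) = %g -> q = %d\n", x, y, r,
           requant(r, out_scale, out_zp));       /* q = 129 */
    return 0;
}

The mod.vx kernels earlier in this diff wrap their arithmetic in the same dequantize/requantize pattern, switching between fmod(x, y) and the floor-based remainder x - y * floor(x / y) according to the isfmod flag.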
+POW_SH_IMPL(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) + +#define POW_SH_IMPL_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \ +__kernel void pow_##name##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src0_type src0; \ + copy0_type data0; \ + src0_type src1; \ + copy0_type data1; \ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data0, src0, 16); \ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data1, src1, 16); \ + float4 x0, x1; \ + float4 y0, y1; \ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + x0 = x0 * input0_scale + input0_tail; \ + x1 = x1 * input0_scale + input0_tail; \ + y0 = y0 * input1_scale + input1_tail; \ + y1 = y1 * input1_scale + input1_tail; \ + float4 s0 = sign(x0); \ + float4 s1 = sign(x1); \ + int4 t0 = convert_int4(y0) & 1; \ + int4 t1 = convert_int4(y1) & 1; \ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; \ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \ + x0 = x0 * output_scale + output_zp; \ + x1 = x1 * output_scale + output_zp; \ + \ + conv_type tmpVal0, tmpVal1; \ + _viv_asm(CONV_RTE, tmpVal0, x0); \ + _viv_asm(CONV_RTE, tmpVal1, x1); \ + dst_type dst0; \ + \ + copy2_type dst; \ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +POW_SH_IMPL_2D(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL_2D(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL_2D(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL_2D(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void pow_BF16_BF16toBF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src0, src1, dst, tmpData; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + 
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, x0, tmpData, 16); + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, x1, tmpData, 16); + + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, y0, tmpData, 16); + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, y1, tmpData, 16); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + _viv_asm(COPY, src0, tmpDst0, 16); + _viv_asm(COPY, src1, tmpDst1, 16); + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_BF16_BF16toBF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src0, src1, dst, tmpData; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, x0, tmpData, 16); + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, x1, tmpData, 16); + + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, y0, tmpData, 16); + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, y1, tmpData, 16); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + _viv_asm(COPY, src0, tmpDst0, 16); + _viv_asm(COPY, src1, tmpDst1, 16); + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx deleted file mode 100644 index 8180085..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx +++ /dev/null @@ -1,338 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2; - -_viv_uniform int input_ZP1; - -_viv_uniform float output_ZP; -_viv_uniform float outputScale; - -__kernel void pow_F16F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1; - vxc_short8 dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1; - vxc_short8 dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1; - vxc_uchar8 dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1; - vxc_uchar8 dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, dst, data0, 16); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, dst, data0, 16); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0; - vxc_uchar8 src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0; - vxc_uchar8 src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx deleted file mode 100644 index f877637..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx +++ /dev/null @@ -1,322 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; - -_viv_uniform float outScale_fl; - -__kernel void pow_F16F16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - 
VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; -_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; -_viv_uniform VXC_512Bits uniExtractOddData_2x8; - -__kernel void pow_BF16BF16toBF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_ushort8 src0, src1, dst, tmpData; - vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, x0, tmpData, 16); - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, x1, tmpData, 16); - - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, y0, tmpData, 16); - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, y1, tmpData, 16); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - _viv_asm(COPY, src0, tmpDst0, 16); - _viv_asm(COPY, src1, tmpDst1, 16); - VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_BF16BF16toBF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_ushort8 src0, src1, dst, tmpData; - vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, x0, tmpData, 16); - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, x1, tmpData, 16); - - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, y0, tmpData, 16); - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, y1, tmpData, 16); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - _viv_asm(COPY, src0, tmpDst0, 16); - _viv_asm(COPY, src1, tmpDst1, 16); - VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx deleted file mode 100644 index 4b1e7fc..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx +++ /dev/null @@ -1,239 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float outScale_fl; - -__kernel void pow_F16F16toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1; - vxc_char8 dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1; - vxc_char8 dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, dst; - vxc_char8 src1; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, dst; - vxc_char8 src1; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0; - vxc_char8 src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0; - vxc_char8 src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx deleted file mode 100644 index f336106..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx +++ /dev/null @@ -1,227 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float outScale_fl; - -__kernel void pow_I16F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16F16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16F16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16I16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16I16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx deleted file mode 100644 index 89ecade..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx +++ /dev/null @@ -1,231 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float outScale_fl; - -__kernel void pow_I8F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_char8 src0; - vxc_short8 src1, dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_char8 src0; - vxc_short8 src1, dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8F16toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_char8 src0, dst; - vxc_short8 src1; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? 
convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8F16toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_char8 src0, dst; - vxc_short8 src1; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8I8toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_char8 src0, src1, dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8I8toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_char8 src0, src1, dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx deleted file mode 100644 index 44e7ca3..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx +++ /dev/null @@ -1,349 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2; - -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; - -_viv_uniform int input_ZP0; -_viv_uniform int input_ZP1; -_viv_uniform float output_ZP; -_viv_uniform float outputScale; - -__kernel void pow_U8F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_short8 dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - 
VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_short8 dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8F16toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_uchar8 dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8F16toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_uchar8 dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0, src1, dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0, src1, dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - vxc_half8 tmpVal; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - vxc_half8 tmpVal; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx index 602f6f5..5cb3ebb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx @@ -11,20 +11,28 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; -#define RESIZE_BILINEAR_4X1(input, mean, output) \ - VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ +#define RESIZE_BILINEAR_4X1(mean, output) \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniVecShift10); \ @@ -52,9 +60,7 @@ _viv_uniform float output_zp; #define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -96,23 +102,32 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ vxc_uchar16 line0Y; \ vxc_uchar16 line1Y; \ int4 coord; \ + int4 coord_in = (int4)(0, 0, 0, 0); \ sx = sx + *xOffset; \ - coord.xyz = sx.xyz; \ - coord.w = sy + *yOffset; 
\ - int2 coord1 = (int2)(sx.w, coord.w); \ - VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + coord = sx.xyzw; \ + coord_in.y = sy + *yOffset; \ + coord_in.x = coord.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ \ int4 test01, temp1; \ int4 test02, temp2; \ @@ -151,8 +166,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ VXC_WriteImage(output0, coord_out, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - RESIZE_BILINEAR_4X1(input1, gMean, output1) \ - RESIZE_BILINEAR_4X1(input2, bMean, output2) \ + RESIZE_BILINEAR_4X1(gMean, output1) \ + RESIZE_BILINEAR_4X1(bMean, output2) \ } PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8) PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) @@ -160,9 +175,7 @@ PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) #define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -205,18 +218,25 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ coord.xyz = sx.xyz; \ coord.w = sy + *yOffset; \ int2 coord1 = (int2)(sx.w, coord.w); \ - VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + int4 coord_in = (int4)(coord.xw, 0, 0); \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ 
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ \ int4 test01, temp1; \ @@ -252,18 +272,26 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + coord_in.x = coord.x; \ + coord_in.z = 1; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ @@ -289,18 +317,26 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - 
VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + coord_in.x = coord.x; \ + coord_in.z = 2; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx index 5a9942c..b0714e4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx @@ -1,12 +1,4 @@ -/* - ============================================================================ - Name : GrayScale.vx - Author : Sam - Version : - Copyright : Your copyright notice - Description : - ============================================================================ - */ + #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; @@ -18,9 +10,7 @@ _viv_uniform float output_zp; #define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -40,9 +30,12 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ vxc_uchar16 src0, src1, src2; \ dst_type dst0, dst1; \ \ - VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord.x = coord.z + 8; \ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * 
output_scale - output_zp, \ @@ -90,9 +83,7 @@ PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -112,9 +103,12 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ vxc_uchar16 src0, src1, src2; \ write_type dst; \ \ - VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ rMean * output_scale - output_zp, output_scale); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx index a82a3ba..1ac60fe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx @@ -8,9 +8,7 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; __kernel void pre_process_rgb888_planar_4over3_U8toU8 ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input1, - __read_only image2d_array_t input2, + __read_only image2d_array_t input, __write_only image2d_array_t output0, __write_only image2d_array_t output1, __write_only image2d_array_t output2, @@ -24,17 +22,21 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 float f32Var ) { - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); int4 coord_out; vxc_uchar16 src0, src1, src2, src3; vxc_uchar16 dst0, dst1, dst2; - VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; coord_out.xy = (coord_in.xy >> 2) * 3; coord_out.zw = coord_in.yy + (int2)(1, 
2); @@ -51,10 +53,15 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); @@ -69,10 +76,14 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); @@ -90,9 +101,7 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 __kernel void pre_process_rgb888_planar_half_U8toU8 ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input1, - __read_only image2d_array_t input2, + __read_only image2d_array_t input, __write_only image2d_array_t output0, __write_only image2d_array_t output1, __write_only image2d_array_t output2, @@ -106,17 +115,22 @@ __kernel void pre_process_rgb888_planar_half_U8toU8 float f32Var ) { - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); vxc_uchar16 src0, src1, src2; - VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 
15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src0, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src1, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src2, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.zw = coord_in.xy >> 1; + int2 coord = coord_in.xy >> 1; - VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx new file mode 100644 index 0000000..107846e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx @@ -0,0 +1,330 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RESIZE_BILINEAR_4X1(input, mean, output) \ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * 
output_scale - mean * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_out, dst, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + \ + vxc_uchar8 line1, line2; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + conv_type dst0; \ + dst_type dst1; \ + copy_type dst; \ + tmp_dst = tmp_dst * output_scale 
- rMean * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output0, coord_out, dst, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + RESIZE_BILINEAR_4X1(input1, gMean, output1) \ + RESIZE_BILINEAR_4X1(input2, bMean, output2) \ +} +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8) +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8) + +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + 
uniConvertIntergetoF32_4x4); \ + \ + int4 dst0; \ + write_type dst; \ + tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + 
uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16) +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx new file mode 100644 index 0000000..ff55851 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx @@ -0,0 +1,143 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2)(*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + dst_type dst0, dst1; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + coord.x = coord.z + 8; \ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ + rMean * output_scale - output_zp, output_scale); \ + \ + half4 paramData_f16; \ + copy_type tmp_dst; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevHi_2x8); \ + _viv_asm(COPY, tmp_dst, dst0, 16); \ + VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmp_dst, dst1, 16); \ + VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ + gMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevHi_2x8); \ + _viv_asm(COPY, 
tmp_dst, dst0, 16); \ + VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmp_dst, dst1, 16); \ + VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ + bMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevHi_2x8); \ + _viv_asm(COPY, tmp_dst, dst0, 16); \ + VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmp_dst, dst1, 16); \ + VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8) +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8) + +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + write_type dst; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ + rMean * output_scale - output_zp, output_scale); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevHi_2x8); \ + VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ + gMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + \ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevHi_2x8); \ + VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ + bMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + \ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst, 
src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevHi_2x8); \ + VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx new file mode 100644 index 0000000..bbfed6e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx @@ -0,0 +1,122 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; + +__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output0, + __write_only image2d_array_t output1, + __write_only image2d_array_t output2, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_out; + + vxc_uchar16 src0, src1, src2, src3; + vxc_uchar16 dst0, dst1, dst2; + + VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.xy = (coord_in.xy >> 2) * 3; + coord_out.zw = coord_in.yy + (int2)(1, 2); + + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, 
VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_rgb888_planar_sep_half_U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output0, + __write_only image2d_array_t output1, + __write_only image2d_array_t output2, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0, src1, src2; + + VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); 
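+    /* the G and B planes are read from input1/input2 the same way below; writing the
+       even-indexed pixels (s02468ace) at the halved coordinate (coord_in >> 1) downsamples
+       each plane by 2x */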
+ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.zw = coord_in.xy >> 1; + + VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc_bound.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc_bound.vx new file mode 100644 index 0000000..f0303f4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc_bound.vx @@ -0,0 +1,153 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8; +_viv_uniform int2 x_coord; + +__kernel void resize_bilinear_nhwc_bound_U8toU8_2x + ( + __read_only image2d_array_t input, + image2d_array_t output, + __write_only image2d_array_t output1 + ) +{ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0)); + int2 coord_in = (int2)(1, get_global_id(0)); + coord_in.y = ((coord_out.y * 2 - 1) >> 2); + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, result; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.z = coord_out.y + 1; + + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + + coord_in.x = x_coord.x; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.x = x_coord.y; + + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l10_4x4; +__kernel void resize_bilinear_nhwc_bound_U8toU8_3x + ( + __read_only image2d_array_t input, + image2d_array_t output, + __write_only image2d_array_t output1 + ) +{ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0)); + int2 coord_in = (int2)(1, get_global_id(0)); + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, result; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.yy + (int2)(1, 2); + + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + + coord_in.x = x_coord.x; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.x = x_coord.y; + + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8; +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l10_4x8; +__kernel void resize_bilinear_nhwc_bound_U8toU8_4x + ( + __read_only image2d_array_t input, + image2d_array_t output, + __write_only image2d_array_t output1 + ) +{ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0)); + int2 coord_in = (int2)(1, get_global_id(0)); + coord_in.y = (coord_out.y * 2 - 3) >> 3; + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, dst0, dst1; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.z = coord_out.y + 1; + + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 2; + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x -= 2; + + coord_out.zw = coord_out.zz + (int2)(1, 2); + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 2; + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = x_coord.x; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.x = x_coord.y; + coord_out.z = coord_out.y + 1; + + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x -= 2; + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 2; + + coord_out.zw = coord_out.zz + (int2)(1, 2); + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x -= 2; + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx index 8553903..ce788a4 100644 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx @@ -1,24 +1,25 @@ #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits uniConvConditiontoDst_2x8; -_viv_uniform VXC_512Bits uniConvIntIn0toDst_2x8; -_viv_uniform VXC_512Bits uniConvIntIn1toDst_2x8; -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In0_2x8; -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In1_2x8; -_viv_uniform int input0Zp; -_viv_uniform int input1Zp; -_viv_uniform int outputZP; -_viv_uniform VXC_512Bits uniU8AddZP_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp #define SELECT_INT(type_name, read_fun, write_fun) \ - type_name tmp, src0, src1, dst, value; \ + type_name src0, src1, dst, value; \ vxc_char8 value_tmp; \ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + vxc_ushort8 mp0, mp1; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(src0, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn0toDst_2x8); \ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(src1, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn1toDst_2x8); \ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift0_Lo_2x8); \ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift1_Lo_2x8); \ read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ @@ -38,6 +39,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name( \ } SELECT_INT_FUN(I8, I8, I8, vxc_char8) +SELECT_INT_FUN(I8, U8, U8, vxc_uchar8) SELECT_INT_FUN(I8, I16, I16, vxc_short8) #define SELECT_INT_FUN_2D(cond_name, src_name, dst_name, type_name) \ @@ -52,6 +54,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name##_2D( \ } SELECT_INT_FUN_2D(I8, I8, I8, vxc_char8) +SELECT_INT_FUN_2D(I8, U8, U8, vxc_uchar8) SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8) #define SELECT_HALF(read_fun, write_fun) \ @@ -88,45 +91,109 @@ __kernel void select_I8_F16_F16toF16_2D( SELECT_HALF(VXC_ReadImage, VXC_WriteImage) } -#define SELECT_U8(read_fun, write_fun) \ - vxc_uchar8 tmp, src0, src1, dst; \ - vxc_char8 value; \ - vxc_half8 tmp1; \ - vxc_uchar16 input0_ZP, input1_ZP, output_ZP; \ - _viv_asm(COPY, input0_ZP, input0Zp, 4); \ - _viv_asm(COPY, input1_ZP, input1Zp, 4); \ - _viv_asm(COPY, output_ZP, outputZP, 4); \ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ +#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \ + vxc_short8 src0, src1, dst, value; \ + vxc_half8 value0, value1; \ + src0_type r0; \ + src1_type r1; \ + copy0_type v0; \ + copy1_type v1; \ + vxc_char8 value_tmp; \ + vxc_ushort8 mp0, mp1; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(tmp1, tmp, input0_ZP, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0),\ - uniU8SubZP_MulM_PStoF16In0_2x8); \ - VXC_DP2x8(src0, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + _viv_asm(COPY, v0, src0, 16); \ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(tmp1, tmp, input1_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniU8SubZP_MulM_PStoF16In1_2x8); \ - VXC_DP2x8(src1, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \ - read_fun(value, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + _viv_asm(COPY, v1, src1, 16); \ + VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift0_Lo_2x8); \ + _viv_asm(COPY, src0, value0, 16); \ + VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift1_Lo_2x8); \ + _viv_asm(COPY, src1, value1, 16); \ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(value, value_tmp, value_tmp,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ dst = (value != 0 ? src0 : src1); \ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -__kernel void select_I8_U8_U8toU8( +#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \ +__kernel void select_##name( \ + __read_only image2d_array_t condition, \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} +SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) +SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16) +SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8) + +#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \ +__kernel void select_##name( \ + __read_only image2d_array_t condition, \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + VXC_ReadImage, VXC_WriteImage) \ +} +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) +SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16) +SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8) + +#define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \ + vxc_short8 src0, src1, tmp_dst, value; \ + vxc_half8 data; \ + dst_type dst; 
\ + vxc_char8 value_tmp; \ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(value, value_tmp, value_tmp,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ + tmp_dst = (value != 0 ? src0 : src1); \ + _viv_asm(COPY, data, tmp_dst, 16); \ + vxc_ushort8 mp0; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, data, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift0_Lo_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void select_I8_F16_F16toU8( __read_only image2d_array_t condition, __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - SELECT_U8(VXC_ReadImage2DArray, VXC_WriteImage2DArray) + SELECT_HALF_TO_QINT(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_uchar16) } -__kernel void select_I8_U8_U8toU8_2D( +__kernel void select_I8_F16_F16toU8_2D( __read_only image2d_array_t condition, __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); - SELECT_U8(VXC_ReadImage, VXC_WriteImage) + SELECT_HALF_TO_QINT(VXC_ReadImage, VXC_WriteImage, vxc_uchar16) } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx index 5717266..ff07885 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx @@ -174,7 +174,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \ SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16) SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16) -#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \ +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type, save_type) \ __kernel void slice_##name0##_I32to##name1 \ ( \ __read_only image2d_array_t input0, \ @@ -186,7 +186,7 @@ __kernel void slice_##name0##_I32to##name1 \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ src_type src; \ copy_type src0; \ - dst_type dst; \ + dst_type result; \ int4 coord_in; \ Image begin_img = create_image_from_image2d(input1, 4); \ uchar* begin_ptr = begin_img.ptr; \ @@ -198,15 +198,19 @@ __kernel void slice_##name0##_I32to##name1 \ \ vxc_ushort8 multiplier; \ _viv_asm(COPY, multiplier, multAndoutZP, 16); \ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, result, 16); \ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ } -SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16) -SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) -SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8) +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16) +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16) +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO(I16, I16, 
vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) -#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \ +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type, save_type) \ __kernel void slice_##name0##_I32to##name1##_2D \ ( \ __read_only image2d_array_t input0, \ @@ -218,7 +222,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ src_type src; \ copy_type src0; \ - dst_type dst; \ + dst_type result; \ int2 coord_in; \ Image begin_img = create_image_from_image2d(input1, 4); \ uchar* begin_ptr = begin_img.ptr; \ @@ -230,10 +234,14 @@ __kernel void slice_##name0##_I32to##name1##_2D \ \ vxc_ushort8 multiplier; \ _viv_asm(COPY, multiplier, multAndoutZP, 16); \ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, result, 16); \ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ } -SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16) -SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) -SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8) \ No newline at end of file +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16) +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16) +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO_2D(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 0dd28ed..2aedbce 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -2967,6 +2967,846 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ "; /* end of conv1d_ovxlib_k1024_vx*/ +static const char cumsum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int 
rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float 
tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void 
cumsum_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, 
sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_vx*/ + +static const char cumsum_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_F16toF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ +\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_AXIS1_2D(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0); \\\n\ + int4 sum1 = (int4)(0); \\\n\ + int4 sum2 = (int4)(0); \\\n\ + int4 sum3 = (int4)(0); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32D_4x4); \\\n\ + \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_8BITS_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_AXIS1_2D(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_I16toI16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F16toF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0, sum1; \\\n\ + sum0 ^= sum0; \\\n\ + sum1 ^= sum1; \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_2d_vx*/ + +static const char cumsum_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, 
dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ +\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ +\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ +\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ +\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ +\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of cumsum_bf16_vx*/ + +static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0(I16, 
vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +"; /* end of cumsum_f16_u8_vx*/ + static const char custom_softmax_vx[] = "/*\n\ ============================================================================\n\ Name : Softmax2.vx\n\ @@ -5237,6 +6077,22 @@ float4 eltwise_unary_celu(float4 val)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ +float4 eltwise_unary_rcp(float4 val)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float4 eltwise_unary_sign(float4 val)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_softsign(float4 val)\n\ +{\n\ + float4 _rcp = 1.0f / (1.0f + fabs(val));\n\ + return val * _rcp;\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -5281,83 +6137,6 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -//EXP\n\ -ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SIN\n\ -ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//COS\n\ -ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//LOG\n\ -ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, 
vxc_uchar8)\n\ -ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SELU\n\ -ELTSISE_UNARY_2D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//NEG\n\ -ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//CELU\n\ -ELTSISE_UNARY_2D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -5392,21 +6171,39 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ VXC_DP2x8(dst, src0, src1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +#define ADD_ELTSISE_UNARY_2D(func_name) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_2D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_2D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_BF16_2D(func_name)\n\ +\n\ //EXP\n\ -ELTSISE_UNARY_BF16_2D(exp)\n\ +ADD_ELTSISE_UNARY_2D(exp)\n\ //SIN\n\ -ELTSISE_UNARY_BF16_2D(sin)\n\ +ADD_ELTSISE_UNARY_2D(sin)\n\ //COS\n\ -ELTSISE_UNARY_BF16_2D(cos)\n\ +ADD_ELTSISE_UNARY_2D(cos)\n\ //LOG\n\ -ELTSISE_UNARY_BF16_2D(log)\n\ +ADD_ELTSISE_UNARY_2D(log)\n\ //SELU\n\ -ELTSISE_UNARY_BF16_2D(selu)\n\ +ADD_ELTSISE_UNARY_2D(selu)\n\ //NEG\n\ -ELTSISE_UNARY_BF16_2D(neg)\n\ +ADD_ELTSISE_UNARY_2D(neg)\n\ //CELU\n\ -ELTSISE_UNARY_BF16_2D(celu)\n\ -"; /* end of eltwise_unary_2d_1_vx*/ +ADD_ELTSISE_UNARY_2D(celu)\n\ +//RCP\n\ +ADD_ELTSISE_UNARY_2D(rcp)\n\ +//SIGN\n\ +ADD_ELTSISE_UNARY_2D(sign)\n\ +//SOFTSIGN\n\ +ADD_ELTSISE_UNARY_2D(softsign)"; /* end of eltwise_unary_2d_1_vx*/ static const char eltwise_unary_3d_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -5706,6 +6503,22 @@ float4 eltwise_unary_celu(float4 val)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ +float4 eltwise_unary_rcp(float4 val)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float4 eltwise_unary_sign(float4 val)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_softsign(float4 val)\n\ +{\n\ + float4 _rcp = 1.0f / (1.0f + fabs(val));\n\ + return val * _rcp;\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -5750,83 +6563,6 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -//EXP\n\ -ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SIN\n\ -ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//COS\n\ -ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//LOG\n\ -ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, 
vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SELU\n\ -ELTSISE_UNARY_3D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//NEG\n\ -ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//CELU\n\ -ELTSISE_UNARY_3D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -5860,20 +6596,39 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ 
VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +#define ADD_ELTSISE_UNARY_3D(func_name) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_3D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_3D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_BF16(func_name)\n\ +\n\ //EXP\n\ -ELTSISE_UNARY_BF16(exp)\n\ +ADD_ELTSISE_UNARY_3D(exp)\n\ //SIN\n\ -ELTSISE_UNARY_BF16(sin)\n\ +ADD_ELTSISE_UNARY_3D(sin)\n\ //COS\n\ -ELTSISE_UNARY_BF16(cos)\n\ +ADD_ELTSISE_UNARY_3D(cos)\n\ //LOG\n\ -ELTSISE_UNARY_BF16(log)\n\ +ADD_ELTSISE_UNARY_3D(log)\n\ //SELU\n\ -ELTSISE_UNARY_BF16(selu)\n\ +ADD_ELTSISE_UNARY_3D(selu)\n\ //NEG\n\ -ELTSISE_UNARY_BF16(neg)\n\ +ADD_ELTSISE_UNARY_3D(neg)\n\ //CELU\n\ -ELTSISE_UNARY_BF16(selu)\n\ +ADD_ELTSISE_UNARY_3D(celu)\n\ +//RCP\n\ +ADD_ELTSISE_UNARY_3D(rcp)\n\ +//SIGN\n\ +ADD_ELTSISE_UNARY_3D(sign)\n\ +//SOFTSIGN\n\ +ADD_ELTSISE_UNARY_3D(softsign)\n\ "; /* end of eltwise_unary_3d_1_vx*/ static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -8234,1661 +8989,139 @@ __kernel void get_matrix_F16toF32\n\ }\n\ "; /* end of get_matrix_vx*/ -static const char group_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char group_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ -\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - 
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ -\n\ - int2 coord = (int2)(gidx, get_global_id(1));\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - }\n\ -\n\ - lcl_sum[lidx] = sumsqr.x;\n\ - lcl_sqr[lidx] = sumsqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - 
vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, outval, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_f16_vx*/ - -static const char group_normalization_f16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ -\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of group_normalization_f16_scale_vx*/ - -static const char group_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - //tmpSumSqr += sumsqr;\n\ - tmpSumSqr.x += sumsqr.x;\n\ - sqr += (sumsqr.y * inFlScale_s2);\n\ - }\n\ - sum = tmpSumSqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ -\n\ - int2 coord = (int2)(gidx, gidz);\n\ - 
vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sqr = sumsqr.y * inFlScale_s2;\n\ - sum = sumsqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int 
is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = 
convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_i16_vx*/ - -static const char group_normalization_i16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform float input_fl_scale;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ 
- scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = 
read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_i16_scale_vx*/ - -static const char group_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniSumInt8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1;\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits 
uniConvertTrdInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ -\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - tmpSqr += (tmpSqr1);\n\ - }\n\ - sqr = tmpSqr * inFlScale_s2;\n\ - sum = tmpSum * input_fl_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ -\n\ - int2 coord = (int2)(gidx, gidz);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - sqr = tmpSqr1 * inFlScale_s2;\n\ - sum = tmpSum1 * input_fl_scale;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ -\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of group_normalization_i8_vx*/ - -static const char group_normalization_i8_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float input_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits 
uniConvertTrdInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ -\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ -\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, 
tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - 
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of group_normalization_i8_scale_vx*/ - -static const char group_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ _viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float rowSumScale;\n\ -_viv_uniform float scale_inOut;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ \n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ _viv_uniform VXC_512Bits uniResetFp32_4x4;\n\ _viv_uniform int 
group_stride;\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ +#define GROUP_NORM_SUMS_8BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ + src_type src0; \\\n\ + float2 sums_f32 = 0; \\\n\ + int2 sums = 0, sum_x_x2; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + if (gidx < width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums = sums + sum_x_x2; \\\n\ + } \\\n\ + sums_f32 = convert_float2(sums); \\\n\ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \\\n\ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums_f32.x; \\\n\ + lcl_sqr[lidx] = sums_f32.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) 
\\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum_x = 0,sum_x2 = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum_x += dot(tmp_sum[i], one); \\\n\ + sum_x2 += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16)\n\ +GROUP_NORM_SUMS_8BITS_IMPL(I8, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ -\n\ - int2 coord = (int2)(gidx, get_global_id(1));\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSqr, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1;\n\ - sqr = (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum1 + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ +#define GROUP_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + \\\n\ + int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ + src_type src0; \\\n\ + float2 sums = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums.y = sums.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums.x; \\\n\ + sums.x = sums.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums.x; \\\n\ + lcl_sqr[lidx] = sums.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum_x = 0,sum_x2 = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { 
\\\n\ + sum_x += dot(tmp_sum[i], one); \\\n\ + sum_x2 += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16)\n\ +GROUP_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_means(\n\ image2d_t input, image2d_t output, float eps, float group_ratio)\n\ {\n\ int gidx = get_global_id(0);\n\ int lidx = get_local_id(0);\n\ \n\ int2 coord = (int2)(gidx, get_global_id(1));\n\ - vxc_uchar16 src0;\n\ + vxc_uchar16 src0 = 1;\n\ float2 sum_sqr = (float2)(0);\n\ - vxc_float4 mean_vari;\n\ + float4 mean_vari;\n\ VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4);\n\ \n\ __local float2 lcl_data[16];\n\ @@ -9925,388 +9158,792 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvar }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +#define CONVERT_INPUT_TO_F32() \\\n\ +VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ +VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ +VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ +VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4);\n\ \n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_8BITS_IMPL(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), 
gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, 
norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_8BITS_F32_IMPL(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ 
+ __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_8BITS_F32_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_F32_IMPL(I8_F32toI8, vxc_char16, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_u8_vx*/ +#define GROUP_NORM_8BITS_F32_IMPL_2D(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8BITS_F32_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_F32_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16)\n\ +"; /* end of group_normalization_0_vx*/ -static const char group_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char group_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_zp;\n\ +\n\ +#define GROUP_NORM_8BITSTOF16_IMPL(name, src_type) \\\n\ +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + vxc_short8 src1, outval; \\\n\ + vxc_half8 scale_h, dst; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 bias_f, scale_f; \\\n\ + \\\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8BITSTOF16_IMPL(U8_F16toF16, vxc_uchar16)\n\ +GROUP_NORM_8BITSTOF16_IMPL(I8_F16toF16, vxc_char16)\n\ +\n\ +\n\ +#define GROUP_NORM_8BITSTOF16_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * 
rSpaceOrg), gidz, 0, 0); \\\n\
+ src_type src0; \\\n\
+ vxc_short8 src1, outval; \\\n\
+ vxc_half8 scale_h, dst; \\\n\
+ float scale_vari, bias_val; \\\n\
+ vxc_float4 bias_f, scale_f; \\\n\
+ \\\n\
+ vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\
+ VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ bias_f = read_imagef(bias, coord_para.xy); \\\n\
+ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ _viv_asm(COPY, scale_h, src1, 16); \\\n\
+ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ \\\n\
+ scale_vari = scale_f.s0 * mean_vari.s1; \\\n\
+ vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\
+ half4 tmpVal0, tmpVal1; \\\n\
+ float alpha = scale_vari * input_scale; \\\n\
+ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\
+ bias_val = bias_val - input_zp * alpha; \\\n\
+ \\\n\
+ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\
+ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\
+ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\
+ norm = alpha * tmpData0 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal0, norm); \\\n\
+ norm = alpha * tmpData1 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal1, norm); \\\n\
+ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, outval, dst, 16); \\\n\
+ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.x += 8; \\\n\
+ norm = alpha * tmpData2 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal0, norm); \\\n\
+ norm = alpha * tmpData3 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal1, norm); \\\n\
+ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, outval, dst, 16); \\\n\
+ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+GROUP_NORM_8BITSTOF16_IMPL_2D(U8_F16toF16, vxc_uchar16)\n\
+GROUP_NORM_8BITSTOF16_IMPL_2D(I8_F16toF16, vxc_char16)\n\
+\n\
+#define GROUP_NORM_8TOF16_F32_IMPL(name, src_type) \\\n\
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __read_only image2d_t bias, \\\n\
+ __read_only image2d_t scale, \\\n\
+ __read_only image2d_t meanVari, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+{ \\\n\
+ int gidy = get_global_id(1); \\\n\
+ int gidz = get_global_id(2); \\\n\
+ int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ src_type src0; \\\n\
+ vxc_short8 outval; \\\n\
+ vxc_half8 dst; \\\n\
+ float scale_vari, bias_val; \\\n\
+ vxc_float4 bias_f, scale_f; \\\n\
+ \\\n\
+ vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\
+ bias_f = read_imagef(bias, coord_para.xy); \\\n\
+ scale_f = read_imagef(scale, coord_para.xy); \\\n\
+ VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ scale_vari = scale_f.s0 * 
mean_vari.s1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8TOF16_F32_IMPL(U8_F32toF16, vxc_uchar16)\n\ +GROUP_NORM_8TOF16_F32_IMPL(I8_F32toF16, vxc_char16)\n\ +\n\ +#define GROUP_NORM_8TOF16_F32_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + vxc_short8 outval; \\\n\ + vxc_half8 dst; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 bias_f, scale_f; \\\n\ + \\\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * 
tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8TOF16_F32_IMPL_2D(U8_F32toF16, vxc_uchar16)\n\ +GROUP_NORM_8TOF16_F32_IMPL_2D(I8_F32toF16, vxc_char16)\n\ +"; /* end of group_normalization_1_vx*/ + +static const char group_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ _viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr; \\\n\ + float4 tmpSumSqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + tmpSumSqr += sumsqr; \\\n\ + } \\\n\ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \\\n\ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = tmpSumSqr.x; \\\n\ + lcl_sqr[lidx] = tmpSumSqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = 
get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + \\\n\ + int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \\\n\ + sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = sumsqr.x; \\\n\ + lcl_sqr[lidx] = sumsqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, 
get_global_id(1), 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = 
get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - 
scale_vari * mean_vari.s0);\n\
-\n\
- VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\
- VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\
- VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\
- VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\
- norm = alpha * tmpData0 + bias_val;\n\
- _viv_asm(CONV, tmpVal0, norm);\n\
- norm = alpha * tmpData1 + bias_val;\n\
- _viv_asm(CONV, tmpVal1, norm);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\
- _viv_asm(COPY, outval, dst, 16);\n\
- VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- coord.x += 8;\n\
- norm = alpha * tmpData2 + bias_val;\n\
- _viv_asm(CONV, tmpVal0, norm);\n\
- norm = alpha * tmpData3 + bias_val;\n\
- _viv_asm(CONV, tmpVal1, norm);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\
- _viv_asm(COPY, outval, dst, 16);\n\
- VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+#define GROUP_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __read_only image2d_t bias, \\\n\
+ __read_only image2d_array_t scale, \\\n\
+ __read_only image2d_t meanVari, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+{ \\\n\
+ int gidz = get_global_id(1); \\\n\
+ int2 coord = (int2)(get_global_id(0), gidz); \\\n\
+ int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\
+ vxc_short8 src0; \\\n\
+ vxc_short8 src1; \\\n\
+ vxc_half8 scale_h; \\\n\
+ src_type in_h; \\\n\
+ float scale_vari, bias_val; \\\n\
+ float4 bias_f, scale_f; \\\n\
+ \\\n\
+ float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\
+ VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ bias_f = read_imagef(bias, coord_para.xy); \\\n\
+ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ _viv_asm(COPY, scale_h, src1, 16); \\\n\
+ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ \\\n\
+ scale_vari = scale_f.s0 * mean_vari.s1; \\\n\
+ float4 tmpData0, tmpData1; \\\n\
+ copy_type outval; \\\n\
+ conv_type tmpVal0, tmpVal1; \\\n\
+ float alpha = input_scale * output_scale * scale_vari; \\\n\
+ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\
+ bias_val = bias_val - input_zp * alpha; \\\n\
+ dst_type dst; \\\n\
+ \\\n\
+ _viv_asm(COPY, in_h, src0, 16); \\\n\
+ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\
+ float4 norm; \\\n\
+ norm = alpha * tmpData0 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal0, norm); \\\n\
+ norm = alpha * tmpData1 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal1, norm); \\\n\
+ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, outval, dst, 
16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -"; /* end of group_normalization_u8_f16_vx*/ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define GROUP_NORM_16BITS_F32_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + 
__read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +"; /* end of group_normalization_2_vx*/ static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -10555,7 +10192,8 @@ __kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid)\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ @@ -10604,9 +10242,12 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ -GRUCELL_QNT_F16TO_QNT(I16, 
I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8)\n\ "; /* end of grucell_activation_z_h_vx*/ static const char grucell_cdnn_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -11455,7 +11096,8 @@ __kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \\\n\ _viv_asm(COPY, dst, dst1, 8); \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid)\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ @@ -11492,9 +11134,12 @@ __kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \\\n\ _viv_asm(COPY, dst, dst1, 8); \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8)\n\ -GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8)\n\ -GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)\n\ +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8)\n\ +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)\n\ +GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8)\n\ +GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8)\n\ "; /* end of grucell_h_times_activation_r_vx*/ static const char grucell_reset_after_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -11790,1141 +11435,181 @@ __kernel void hswish_BF16toBF16_2D(\n\ }\n\ "; /* end of hswish_vx*/ -static const char instance_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char instance_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ +_viv_uniform float inv_multiplier;\n\ _viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ \n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_16x2;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = 
(vxc_float4)(0);\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ \n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - 
vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - coord_in.y ++;\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = 
rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_f16_vx*/ - -static const char instance_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ +#define INSTANCE_NORM_SUMS_8BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz); \\\n\ + src_type src0; \\\n\ + float2 sums_f32 = 0; \\\n\ + int2 sums = 0, sum_x_x2; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + if(gidx 
< width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - //tmpSumSqr += sumsqr;\n\ - tmpSumSqr.x += sumsqr.x;\n\ - sqr += (sumsqr.y * inFlScale_s2);\n\ - }\n\ - sum = tmpSumSqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums = sums + sum_x_x2; \\\n\ + } \\\n\ + sums_f32 = convert_float2(sums); \\\n\ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \\\n\ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums_f32.x; \\\n\ + lcl_sqr[lidx] = sums_f32.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum = 0, sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16)\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - //tmpSumSqr += sumsqr;\n\ - tmpSumSqr.x += sumsqr.x;\n\ - sqr += (sumsqr.y * inFlScale_s2);\n\ - }\n\ - sum = tmpSumSqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = 
sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ +#define INSTANCE_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + src_type src0; \\\n\ + float2 sums_f32 = 0; \\\n\ + int2 sums = 0, sum_x_x2; \\\n\ + int endH = gidy + height; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + if (gidx < width) \\\n\ + { \\\n\ + for(; coord.y < endH;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums = sums + sum_x_x2; \\\n\ + } \\\n\ + sums_f32 = convert_float2(sums); \\\n\ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \\\n\ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums_f32.x; \\\n\ + lcl_sqr[lidx] = sums_f32.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum = 0, sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < 
group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - 
uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - 
int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_i16_vx*/ - -static const char instance_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniSumInt8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1;\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ -\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + 
input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - tmpSqr += (tmpSqr1);\n\ - }\n\ - sqr = tmpSqr * inFlScale_s2;\n\ - sum = tmpSum * input_fl_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - tmpSqr += (tmpSqr1);\n\ - }\n\ - sqr = tmpSqr * inFlScale_s2;\n\ - sum = tmpSum * input_fl_scale;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - coord_para = coord;\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_para.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.xy = coord.xy;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ -\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, 
scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para = coord;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val 
= (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, 
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of instance_normalization_i8_vx*/ - -static const char instance_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float scale_inOut;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \\\n\ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ - image2d_array_t output, float eps, int rsFlg) \\\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16)\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16)\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +#define INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ - int2 coord_para = (int2)(gidz, 0); \\\n\ - read_type src0, src2; \\\n\ + int2 coord_para = (int2)(0, gidz); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ float scale_vari, bias_val; \\\n\ - vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ - \\\n\ - Image img1 = create_image_from_image2d(bias, 4); \\\n\ - Image img2 = create_image_from_image2d(scale, 4); \\\n\ - Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ - __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ - __global 
float* scal_ptr = (__global float*)img2.ptr; \\\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \\\n\ - \\\n\ - float bval = bias_ptr[gidz]; \\\n\ - float sval = scal_ptr[gidz]; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ \\\n\ + scale_f = read_imagef(scale, coord_para); \\\n\ + bias_f = read_imagef(bias, coord_para); \\\n\ for(int i = 0; i < group_num; i++) \\\n\ { \\\n\ - mean_vari += vari_ptr[i]; \\\n\ + mean_vari += read_imagef(meanVari, coord_para); \\\n\ + coord_para.x += 4; \\\n\ } \\\n\ - mean_vari *= dimRatio; \\\n\ + mean_vari *= inv_multiplier; \\\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ \\\n\ - scale_vari = sval * mean_vari.s1; \\\n\ - short zp = inputZP; \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ vxc_int4 tmpVal0, tmpVal1; \\\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = scale_inOut * scale_vari; \\\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ int8 input_desc, output_desc; \\\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ @@ -12936,240 +11621,543 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ \\\n\ for(coord.y = 0; coord.y < height; coord.y++) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_in.y ++; \\\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ norm = tmpData0 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData1 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ norm = tmpData2 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData3 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, 
coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ -INSTANCENORM_8BITS_F32(U8, vxc_uchar16)\n\ -INSTANCENORM_8BITS_F32(I8, vxc_char16)\n\ +INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16)\n\ \n\ -#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \\\n\ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \\\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ - image2d_array_t output, float eps, int rsFlg) \\\n\ +#define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ int2 coord = (int2)(get_global_id(0), gidy); \\\n\ - int2 coord_para = (int2)(gidz, 0); \\\n\ + int2 coord_para = (int2)(0, gidz); \\\n\ int endH = gidy + height; \\\n\ - read_type src0, src2; \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ float scale_vari, bias_val; \\\n\ - vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ - \\\n\ - Image img1 = create_image_from_image2d(bias, 4); \\\n\ - Image img2 = create_image_from_image2d(scale, 4); \\\n\ - Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ - __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ - __global float* scal_ptr = (__global float*)img2.ptr; \\\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \\\n\ - \\\n\ - float bval = bias_ptr[gidz]; \\\n\ - float sval = scal_ptr[gidz]; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ \\\n\ + scale_f = read_imagef(scale, coord_para); \\\n\ + bias_f = read_imagef(bias, coord_para); \\\n\ for(int i = 0; i < group_num; i++) \\\n\ { \\\n\ - mean_vari += vari_ptr[i]; \\\n\ + mean_vari += read_imagef(meanVari, coord_para); \\\n\ + coord_para.x += 4; \\\n\ } \\\n\ - \\\n\ - mean_vari *= dimRatio; \\\n\ + mean_vari *= inv_multiplier; \\\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ \\\n\ - scale_vari = sval * mean_vari.s1; \\\n\ - short zp = inputZP; \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ vxc_int4 tmpVal0, tmpVal1; \\\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = scale_inOut * scale_vari; \\\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ for(; coord.y < endH; coord.y++) \\\n\ { \\\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - 
uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ norm = tmpData0 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData1 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ norm = tmpData2 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData3 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ -INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16)\n\ -INSTANCENORM_8BITS_F32_2D(I8, vxc_char16)\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - int8 
input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t meanVari, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_scale_f32_vx*/ +INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +INSTANCE_NORM_8BITS_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16)"; /* end of instance_normalization_0_vx*/ -static const char instance_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char instance_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +\n\ +#define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + vxc_short8 outval; \\\n\ + vxc_half8 dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + coord_para = coord; \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.z, baseAddr); \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_para.xy = coord.xy; \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = 
alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_para.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16)\n\ +INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16)\n\ +\n\ +#define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + int endH = gidy + height; \\\n\ + src_type src0; \\\n\ + vxc_short8 outval; \\\n\ + vxc_half8 dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + for(; coord.y < endH;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_para = coord; \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_para.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); 
\\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_8_TO_F16_IMPL_2D(U8_F32toF16, vxc_uchar16)\n\ +INSTANCE_NORM_8_TO_F16_IMPL_2D(I8_F32toF16, vxc_char16)\n\ +"; /* end of instance_normalization_1_vx*/ + +static const char instance_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr; \\\n\ + float4 tmpSumSqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniSum_X_X2_8x2); \\\n\ + tmpSumSqr += sumsqr; \\\n\ + } \\\n\ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \\\n\ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = tmpSumSqr.x; \\\n\ + lcl_sqr[lidx] = tmpSumSqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ +\n\ +#define INSTANCE_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \\\n\ +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr; \\\n\ + float4 tmpSumSqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + int endH = gidy + height; \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + for(; coord.y < endH;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniSum_X_X2_8x2); \\\n\ + tmpSumSqr += sumsqr; \\\n\ + } \\\n\ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \\\n\ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = tmpSumSqr.x; \\\n\ + lcl_sqr[lidx] = tmpSumSqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ +\n\ +#define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + 
dst_type dst; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + \\\n\ + coord_in.y ++; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + int endH = gidy + height; \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord.y < endH; coord.y++) \\\n\ + { \\\n\ + 
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +"; /* end of instance_normalization_2_vx*/ + +static const char instance_normalization_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float inv_multiplier;\n\ _viv_uniform int group_num;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ -constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ -constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16(\n\ image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ @@ -13178,8 +12166,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ float4 srcA, srcB;\n\ - vxc_float sum = 0, sqr = 0;\n\ -\n\ + float sum = 0, sqr = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ \n\ @@ -13229,7 +12218,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D(\n\ image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ @@ -13240,7 +12229,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy);\n\ vxc_short8 src0, src1, src2;\n\ float4 srcA, srcB;\n\ - vxc_float sum = 0, sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -13287,7 +12278,7 @@ __kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16(\n\ image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ image2d_array_t output, float eps, int rsFlg)\n\ {\n\ @@ -13296,30 +12287,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ + float4 mean_vari = (float4)(0);\n\ \n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ \n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ + float sval = read_imagef(scale, coord.yz).x;\n\ + float bval = read_imagef(bias, coord.yz).x;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ mean_vari += vari_ptr[i];\n\ }\n\ \n\ - mean_vari *= dimRatio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ + float4 tmpData0, tmpData1;\n\ bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ int8 input_desc, output_desc;\n\ @@ -13343,7 +12330,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16);\n\ _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ - vxc_float4 norm;\n\ + float4 norm;\n\ norm = scale_vari * tmpData0 + bias_val;\n\ _viv_asm(COPY, src0, norm, 16);\n\ norm = scale_vari * tmpData1 + bias_val;\n\ @@ -13365,30 +12352,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int endH = gidy + height;\n\ vxc_short8 src0, src1, src2;\n\ float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ + float4 mean_vari = (float4)(0);\n\ \n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ \n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ + float sval = read_imagef(scale, coord_para.yx).x;\n\ + float bval = read_imagef(bias, coord_para.yx).x;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ mean_vari += vari_ptr[i];\n\ }\n\ \n\ - mean_vari *= dimRatio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ + float4 tmpData0, tmpData1;\n\ bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ for(; coord.y < endH; coord.y++)\n\ @@ -13402,7 +12385,7 @@ __kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16);\n\ _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ - vxc_float4 norm;\n\ + float4 norm;\n\ norm = scale_vari * tmpData0 + bias_val;\n\ _viv_asm(COPY, src0, norm, 16);\n\ norm = scale_vari * tmpData1 + bias_val;\n\ @@ -13410,558 +12393,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -}"; /* end of instance_normalization_scale_f32_bf16_vx*/ - -static const char instance_normalization_scale_f32_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - coord_in.y ++;\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * 
tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_scale_f32_f16_vx*/ - -static const char instance_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int 
sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float rowSumScale;\n\ -_viv_uniform float scale_inOut;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ - int endH = gidy + height;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int 
i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D(\n\ - image2d_array_t input, image2d_array_t 
bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_u8_vx*/ - -static const char instance_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = 
get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - coord_para = coord;\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_para.z, baseAddr);\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.xy = coord.xy;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - 
int endH = gidy + height;\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para = coord;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of instance_normalization_u8_f16_vx*/ +}"; /* end of instance_normalization_3_vx*/ static const char l2normalizescale_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -14059,6 +12491,7 @@ _viv_uniform float zpSqrt16x;\n\ _viv_uniform VXC_512Bits uniSumAll_16x1;\n\ _viv_uniform int inputZP;\n\ \n\ +\n\ #define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ vxc_float4 rsqrt0;\\\n\ Image dst_img = create_image_from_image2d(output, 1); \\\n\ @@ -14108,31 +12541,31 @@ _viv_uniform int inputZP;\n\ dst_ptr[0] = dst.s0; \\\n\ break; \\\n\ case 2: \\\n\ - VXC_Vstore2(dst_ptr, 0, dst); \\\n\ + VXC_Vstore2(dst_ptr, 0, dst.s01); \\\n\ break; \\\n\ case 3: \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 4: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ break; \\\n\ case 5: \\\n\ - VXC_Vstore2(dst_ptr, 0, dst); \\\n\ + VXC_Vstore2(dst_ptr, 0, 
dst.s01); \\\n\ dst.s012 = dst.s234; \\\n\ dst_ptr += 2; \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 6: \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ dst.s012 = dst.s345; \\\n\ dst_ptr += 3; \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 7: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ dst.s012 = dst.s456; \\\n\ dst_ptr += 4; \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ default: \\\n\ VXC_Vstore8(dst_ptr, 0, dst); \\\n\ @@ -14142,16 +12575,13 @@ _viv_uniform int inputZP;\n\ } \\\n\ \n\ \n\ -#define L2NORMSCALE_AXIS0_2D(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \\\n\ +#define L2NORMSCALE_AXIS0(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \\\n\ dst_type, convert_type, output_type, copy_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ - void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \\\n\ (\\\n\ - __read_only image2d_t input,\\\n\ - __read_only image2d_t scale,\\\n\ - __write_only image2d_t output,\\\n\ - int axis\\\n\ - )\\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output,\\\n\ + int axis )\\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ @@ -14201,19 +12631,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ }\n\ \n\ -L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \\\n\ +L2NORMSCALE_AXIS0(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \\\n\ ushort, half4, vxc_half8, vxc_ushort8)\n\ \n\ -#define L2NORMSCALE_AXIS0_QNT_2D(in0_name, in1_name, out_name,\\\n\ +#define L2NORMSCALE_AXIS0_QNT(in0_name, in1_name, out_name,\\\n\ src_type, src_scalar_type, dst_type, convert_type, output_type, copy_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ -void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ +void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \\\n\ (\\\n\ - __read_only image2d_t input,\\\n\ - __read_only image2d_t scale,\\\n\ - __write_only image2d_t output,\\\n\ - int axis\\\n\ - )\\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ @@ -14267,14 +12693,223 @@ void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ }\n\ \n\ -L2NORMSCALE_AXIS0_QNT_2D(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8)\n\ +L2NORMSCALE_AXIS0_QNT(U8, F16, F16, vxc_uchar8, 
uchar, ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_QNT(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8)\n\ +L2NORMSCALE_AXIS0_QNT(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_QNT(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8)\n\ +L2NORMSCALE_AXIS0_QNT(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_QNT(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8)\n\ "; /* end of l2normalizescale_axis0_vx*/ +static const char l2normalizescale_axis0_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float zP2x;\n\ +_viv_uniform int inputZP;\n\ +\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float zpSqr8x;\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1)))\n\ + void l2normalizescale_axis0_F16_F16toF16_2D(\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int lidx = get_local_id(0);\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 in_h, scale_h, tmpDst;\n\ + float sum = 0;\n\ + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1;\n\ + __local float lcl_sum[16];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + for(; coord.x < inputWidth;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 128;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.y;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + float alpha = rsqrt(sum);\n\ +\n\ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + tmpData0 *= scale_f0 * alpha;\n\ + tmpData1 *= scale_f1 * alpha;\n\ + _viv_asm(CONV, tmpVal0, tmpData0);\n\ + 
_viv_asm(CONV, tmpVal1, tmpData1);\n\ + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpDst, 16);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +\n\ +#define L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(in0_name, in1_name, out_name, read_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + int lidx = get_local_id(0); \\\n\ + read_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0; \\\n\ + vxc_float4 scale_f0, scale_f1, sumsqr; \\\n\ + __local float lcl_sum[16]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + for(; coord.x < inputWidth;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 128; \\\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniInt16SumSqr_dp8x2); \\\n\ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \\\n\ + } \\\n\ + sum *= e2InScale; \\\n\ + lcl_sum[lidx] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + sum = dot(data0, one); \\\n\ + float alpha = rsqrt(sum) * inOutScale; \\\n\ + short zp = inputZP; \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + \\\n\ + int4 tmpVal0 = convert_int4_rte(tmpData0 * scale_f0 * alpha + output_ZP); \\\n\ + int4 tmpVal1 = convert_int4_rte(tmpData1 * scale_f1 * alpha + output_ZP); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(U8, F16, U8, vxc_uchar8)\n\ +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I8, F16, I8, vxc_char8)\n\ +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I16, F16, I16, vxc_short8)\n\ +\n\ +#define L2NORMSCALE_QINTF16TOF16_AXIS0_2D(in0_name, in1_name, out_name, read_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, 
int axis)\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + int lidx = get_local_id(0); \\\n\ + read_type src0; \\\n\ + vxc_short8 src1, dst; \\\n\ + vxc_half8 scale_h, tmpDst; \\\n\ + float sum = 0; \\\n\ + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1; \\\n\ + __local float lcl_sum[16]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + for(; coord.x < inputWidth;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 128; \\\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniInt16SumSqr_dp8x2); \\\n\ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \\\n\ + } \\\n\ + sum *= e2InScale; \\\n\ + lcl_sum[lidx] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + sum = dot(data0, one); \\\n\ + float alpha = rsqrt(sum) * inOutScale; \\\n\ + short zp = inputZP; \\\n\ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + tmpData0 *= scale_f0 * alpha; \\\n\ + tmpData1 *= scale_f1 * alpha; \\\n\ + _viv_asm(CONV, tmpVal0, tmpData0); \\\n\ + _viv_asm(CONV, tmpVal1, tmpData1); \\\n\ + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(U8, F16, F16, vxc_uchar8)\n\ +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I8, F16, F16, vxc_char8)\n\ +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I16, F16, F16, vxc_short8)\n\ +"; /* end of l2normalizescale_axis0_2d_vx*/ + static const char l2normalizescale_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /********************************************L2NormalizeScale*****************************************/\n\ @@ -14444,1208 +13079,1131 @@ L2NORMSCALE_AXIS1_QNT_2D(I16, F16, F16, vxc_short8, vxc_short8, vxc_half8, h L2NORMSCALE_AXIS1_QNT_2D(I16, F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ "; /* end of l2normalizescale_axis1_vx*/ -static const char layer_normalization_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char layer_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -/**************************layernorm float16***********************************/\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform 
VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \\\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float 
input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ -__kernel void layer_norm_U8toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ +#define CONV2F32(dst, src, section) \\\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataToFP32_##section##_4x4);\n\ \n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \\\n\ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_8BITS_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, 
coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -/***************************layernorm float16 to uint8**************************/\n\ -__kernel void layer_norm_F16toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ +LAYER_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16)\n\ +LAYER_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16)\n\ \n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ +#define LAYER_NORM_SUMS_2D() \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y);\n\ \n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0));\n\ +#define LAYER_NORM_8BITS_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + \\\n\ + LAYER_NORM_SUMS_2D(); \\\n\ + \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16)\n\ +LAYER_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16)\n\ \n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - 
uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - norm = norm * outputScale + output_zp;\n\ - int4 output_int4;\n\ - output_int4 = convert_int4_rte(norm);\n\ - vxc_uchar8 dst;\n\ - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - coord_out.x = coord.x;\n\ +#define LAYER_NORM_8TOF16_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + 
CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x = coord.x; \\\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_vx*/ - -static const char layer_normalization_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8TOF16_IMPL(U8_F16toF16, vxc_uchar16)\n\ +LAYER_NORM_8TOF16_IMPL(I8_F16toF16, vxc_char16)\n\ \n\ -/**************************layernorm float16***********************************/\n\ +#define LAYER_NORM_8TOF16_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + \\\n\ + LAYER_NORM_SUMS_2D(); \\\n\ + \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, 
coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x -= 8; \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8TOF16_IMPL_2D(U8_F16toF16, vxc_uchar16)\n\ +LAYER_NORM_8TOF16_IMPL_2D(I8_F16toF16, vxc_char16)\n\ +"; /* end of layer_normalization_0_vx*/ + +static const char layer_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - 
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ -__kernel void layer_norm_U8toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ +#define CONV2F32(dst, src, section) \\\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataToFP32_##section##_4x4);\n\ \n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean;\n\ - tmpData1 = tmpData1 * input_scale - mean;\n\ - tmpData2 = tmpData2 * input_scale - mean;\n\ - tmpData3 = tmpData3 * input_scale - mean;\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + vxc_short8 src1; \\\n\ + dst_type result; \\\n\ + vxc_half8 scale_h; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = 
(int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -/***************************layernorm float16 to uint8**************************/\n\ -__kernel void layer_norm_F16toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, 
coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - norm = norm * outputScale + output_zp;\n\ - int4 output_int4;\n\ - output_int4 = convert_int4_rte(norm);\n\ - vxc_uchar8 dst;\n\ - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define LAYER_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + dst_type result; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + coord_bias.x += 4; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; 
\\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -"; /* end of layer_normalization_2d_vx*/ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define LAYER_NORM_16_32_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + dst_type result; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + bias_ptr += 8; \\\n\ + scale_ptr += 8; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + 
_viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_16_32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16_32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16_32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16_32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define LAYER_NORM_16_32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + dst_type result; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + bias_ptr += 8; \\\n\ + scale_ptr += 8; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, dst, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +"; /* end of layer_normalization_1_vx*/ -static const char layer_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char layer_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -/**************************layernorm float16***********************************/\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform float dimRatio_scale;\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ \n\ -__kernel void layer_norm_I16toI16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ +#define CONV2F32(dst, src, section) \\\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataToFP32_##section##_4x4);\n\ \n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord_in.x < width;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio_scale;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, 
scale_f1;\n\ - vxc_half8 scale_h;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord_in.x;\n\ - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_8_32_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); 
\\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, scale_ptr); \\\n\ + \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + bias_ptr += 16; \\\n\ + scale_ptr += 16; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ +LAYER_NORM_8_32_IMPL(U8_F32toU8, vxc_uchar16)\n\ +LAYER_NORM_8_32_IMPL(I8_F32toI8, vxc_char16)\n\ \n\ -__kernel void layer_norm_I16toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(0, get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord.x < width;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean, vari;\n\ - mean = sum * dimRatio_scale;\n\ - vari = sqr * dimRatio - mean * mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_half8 scale_h;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 8)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ 
- uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_8_32_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, scale_ptr); \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + bias_ptr += 16; \\\n\ + scale_ptr += 16; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + 
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -"; /* end of layer_normalization_i16_vx*/ +LAYER_NORM_8_32_IMPL_2D(U8_F32toU8, vxc_uchar16)\n\ +LAYER_NORM_8_32_IMPL_2D(I8_F32toI8, vxc_char16)\n\ +\n\ +#define LAYER_NORM_8_32TOF16_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, scale_ptr); \\\n\ + _viv_asm(CONV, 
tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + bias_ptr += 16; \\\n\ + scale_ptr += 16; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8_32TOF16_IMPL(U8_F32toF16, vxc_uchar16)\n\ +LAYER_NORM_8_32TOF16_IMPL(I8_F32toF16, vxc_char16)\n\ +\n\ +#define LAYER_NORM_8_32TOF16_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + bias_ptr += 8; \\\n\ + scale_ptr += 8; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, 
scale_ptr); \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x -= 8; \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8_32TOF16_IMPL_2D(U8_F32toF16, vxc_uchar16)\n\ +LAYER_NORM_8_32TOF16_IMPL_2D(I8_F32toF16, vxc_char16)\n\ +"; /* end of layer_normalization_2_vx*/ -static const char layer_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/**************************layernorm float16***********************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f, scale_f, in_f;\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, 
input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = vload4(0, bias_ptr + coord.x);\n\ - scale_f = vload4(0, scale_ptr + coord.x);\n\ - vxc_half8 in_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \\\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform float dimRatio_scale;\n\ -\n\ -__kernel void layer_norm_U8F32toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ -\n\ - vxc_uchar16 src0, src2;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - __global float* bias_ptr = 
(__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - bias_f2 = vload4(2, bias_ptr);\n\ - bias_f3 = vload4(3, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - scale_f2 = vload4(2, scale_ptr);\n\ - scale_f3 = vload4(3, scale_ptr);\n\ - bias_ptr += 16;\n\ - scale_ptr += 16;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \\\n\ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel void layer_norm_I16F32toI16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_short8 src0, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord_in.x < width;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio_scale;\n\ - vxc_float vari;\n\ 
- vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - int2 coord_bias = (int2)(0, 0);\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias);\n\ - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - bias_ptr += 8;\n\ - scale_ptr += 8;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_scale_f32_vx*/ - -static const char layer_normalization_scale_f32_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/**************************layernorm float16***********************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16F32toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f, scale_f, in_f;\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ - 
for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = vload4(0, bias_ptr + coord.x);\n\ - scale_f = vload4(0, scale_ptr + coord.x);\n\ -\n\ - vxc_half8 in_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform float dimRatio_scale;\n\ -\n\ -__kernel void layer_norm_U8F32toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - bias_f2 = vload4(2, bias_ptr);\n\ - 
bias_f3 = vload4(3, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - scale_f2 = vload4(2, scale_ptr);\n\ - scale_f3 = vload4(3, scale_ptr);\n\ - bias_ptr += 16;\n\ - scale_ptr += 16;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean;\n\ - tmpData1 = tmpData1 * input_scale - mean;\n\ - tmpData2 = tmpData2 * input_scale - mean;\n\ - tmpData3 = tmpData3 * input_scale - mean;\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel void layer_norm_I16F32toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord.x < width;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean, vari;\n\ - mean = sum * dimRatio_scale;\n\ - vari = sqr * dimRatio - mean * mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_half8 scale_h;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ -\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ - for(coord.x = 0; coord.x < width; coord.x += 8)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - bias_ptr += 8;\n\ - scale_ptr += 8;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, 
src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_scale_f32_2d_vx*/ - -static const char layer_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char layer_normalization_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /**************************layernorm float16***********************************/\n\ _viv_uniform int width;\n\ @@ -15803,1372 +14361,7 @@ __kernel void layer_norm_BF16F32toBF16_2D(\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -}"; /* end of layer_normalization_scale_f32_bf16_vx*/ - -static const char layer_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/*****************************layernorm uint8 to fp16****************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ -\n\ -__kernel void layer_norm_U8toF16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - 
mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_short8 src1, outval;\n\ - short zp = inputZP;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ -\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x += 8;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel void layer_norm_U8toF16_2D(\n\ - image2d_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * 
tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_short8 src1, outval;\n\ - short zp = inputZP;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ -\n\ - coord_bias.x += 4;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ -\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x += 8;\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of layer_normalization_u8_f16_vx*/ - -static const char layer_normalization_wh_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform int width;\n\ -\n\ -_viv_uniform int height;\n\ -\n\ -_viv_uniform int height_depth;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform 
float output_zp;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 
coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = 
(vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + 
output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_wh_f16_vx*/ - -static const char layer_normalization_wh_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform int width;\n\ -\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int height;\n\ -\n\ -_viv_uniform int height_depth;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform int inputZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - float4 tmpSumSqr = (float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - tmpSumSqr.x *= input_scale;\n\ - tmpSumSqr.y *= e2InScale;\n\ - }\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - float4 data = (float4)(0);\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - data.x += dot(tmp_sum[i], one);\n\ - data.y += dot(tmp_sqr[i], one);\n\ - }\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D(\n\ - image2d_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy 
= gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - float4 tmpSumSqr = (float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - tmpSumSqr.x *= input_scale;\n\ - tmpSumSqr.y *= e2InScale;\n\ - }\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - float4 data = (float4)(0);\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - data.x += dot(tmp_sum[i], one);\n\ - data.y += dot(tmp_sqr[i], one);\n\ - }\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_short8 src0, src1, outval;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_short8 src0, src1, outval;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_wh_i16_vx*/ - -static const 
char layer_normalization_wh_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float rowSumScale;\n\ -_viv_uniform int width;\n\ -\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int height;\n\ -\n\ -_viv_uniform int height_depth;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform int inputZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D(\n\ - image2d_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 
0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y; coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - 
uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - 
vxc_uchar16 src0 , outval;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_uchar16 src0, outval;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = 
rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_wh_u8_vx*/ +}"; /* end of layer_normalization_3_vx*/ static const char log_softmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform float rlogE;\n\ @@ -22546,6 +19739,7 @@ __kernel void gemm_BF16BF16toBF16(image2d_array_t inputA,\n\ sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22623,6 +19817,7 @@ __kernel void gemm_transa_BF16BF16toBF16(\n\ sum3 = (sum3 + tempA0.w * tempB0);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22657,7 +19852,7 @@ __kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ \n\ @@ -22826,6 +20021,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ sum3 += (tempA3);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22914,6 +20110,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ sum3 += (tempA3 + tempB3);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22988,7 +20185,7 @@ __kernel void gemm_F32F32toF32(\n\ sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3);\n\ sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);\n\ }\n\ - coord_b = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + coord_b = (int4)(gidx, gidy, get_global_id(2), get_global_id(2));\n\ write_imagef(output, coord_b, sum0);\n\ coord_b.y++;\n\ write_imagef(output, coord_b, sum1);\n\ @@ -23083,6 +20280,7 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ write_type outC; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23172,6 +20370,7 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ write_type outC; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23282,6 +20481,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23370,6 +20570,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23475,6 +20676,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ sum2 *= input1Scale; \\\n\ sum3 *= input1Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23571,6 +20773,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ } \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, 
coord_b.w, baseAddr); \\\n\ @@ -23669,6 +20872,7 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23777,6 +20981,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23868,6 +21073,7 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23953,6 +21159,7 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -24035,6 +21242,7 @@ __kernel void gemm_transa_F16F16toF16(\n\ sum3 = (sum3 + tempA0.w * tempB0);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -24060,7 +21268,8 @@ __kernel void gemm_transa_F16F16toF16(\n\ _viv_asm(COPY, outC, valC, 16);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ -}"; /* end of matrixmul_transA_vx*/ +}\n\ +"; /* end of matrixmul_transA_vx*/ static const char matrixmul_transB_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -24079,7 +21288,7 @@ __kernel void gemm_transb_F16F16toF16(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ @@ -24336,7 +21545,7 @@ __kernel void gemm_transb_F16U8toU8(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ \n\ @@ -24470,7 +21679,7 @@ __kernel void gemm_transb_U8U8toF16(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ @@ -24605,7 +21814,7 @@ __kernel void gemm_transb_U8U8toU8(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ @@ -24799,6 +22008,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = get_global_id(1); \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -24911,6 +22121,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ sum2 *= input0Scale; \\\n\ sum3 *= input0Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25016,6 +22227,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ sum2 *= input0Scale; \\\n\ sum3 *= input0Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25127,6 +22339,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25225,6 +22438,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25267,6 +22481,7 @@ _viv_uniform int bc2zero;\n\ _viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ _viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ _viv_uniform float input01Scale;\n\ +_viv_uniform float mulKIn0In1Zp;\n\ \n\ #define GEMM_QINT_TO_F16(src0_type_name, read_type) \\\n\ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ @@ -25280,10 +22495,8 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ \\\n\ int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ - vxc_float4 sum0 = (vxc_float4)(0); \\\n\ - vxc_float4 sum1 = (vxc_float4)(0); \\\n\ - vxc_float4 sum2 = (vxc_float4)(0); \\\n\ - vxc_float4 sum3 = (vxc_float4)(0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\ + vxc_float4 sum1 = sum0, sum2 = sum0, sum3 = sum0; \\\n\ \\\n\ int8 inputA_desc, inputB_desc, output_desc; \\\n\ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ @@ -25341,6 +22554,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ sum2 *= input01Scale; \\\n\ sum3 *= input01Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25442,6 +22656,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ } \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -26283,6 +23498,192 @@ MINIMUM_QUANTTOF16_2D_IMPL(U8U8toF16, vxc_uchar16)\n\ MINIMUM_QUANTTOF16_2D_IMPL(I8I8toF16, vxc_char16)\n\ MINIMUM_QUANTTOF16_2D_IMPL(I16I16toF16, vxc_short8)"; /* end of minimum_1_vx*/ +static const char mod_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4;\n\ +\n\ +_viv_uniform float in_scale0;\n\ +_viv_uniform float in_scale1;\n\ +_viv_uniform float out_scale;\n\ +_viv_uniform float in0Tail;\n\ +_viv_uniform float in1Tail;\n\ +_viv_uniform float out_zp;\n\ +\n\ +#define MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + save_type data; \\\n\ + read_type read_data0, read_data1; \\\n\ + copy_type tmpData0, tmpData1; \\\n\ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \\\n\ + vxc_float4 tmpVal1, tmpVal2; \\\n\ + dst_type tmpOut1, tmpOut2; \\\n\ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpData0, read_data0, 16); \\\n\ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpData1, read_data1, 16); \\\n\ + VXC_DP4x4(in0Val1, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(in0Val2, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + VXC_DP4x4(in1Val1, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(in1Val2, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + in0Val1 = in0Val1 * IN0_SCALE + IN0_TAIL; \\\n\ + in0Val2 = in0Val2 * IN0_SCALE + IN0_TAIL; \\\n\ + in1Val1 = in1Val1 * IN1_SCALE + IN1_TAIL; \\\n\ + in1Val2 = in1Val2 * IN1_SCALE + IN1_TAIL; \\\n\ + if (isfmod) \\\n\ + { \\\n\ + tmpVal1 = fmod(in0Val1, in1Val1) * OUT_SCALE + OUT_OFFSET; \\\n\ + tmpVal2 = fmod(in0Val2, in1Val2) * OUT_SCALE + OUT_OFFSET; \\\n\ + 
} \\\n\ + else \\\n\ + { \\\n\ + tmpVal1 = (in0Val1 - in1Val1 * floor(in0Val1 / in1Val1)) * OUT_SCALE + OUT_OFFSET; \\\n\ + tmpVal2 = (in0Val2 - in1Val2 * floor(in0Val2 / in1Val2)) * OUT_SCALE + OUT_OFFSET; \\\n\ + } \\\n\ + _viv_asm(conv_mode, tmpOut1, tmpVal1); \\\n\ + _viv_asm(conv_mode, tmpOut2, tmpVal2); \\\n\ + VXC_DP2x8(data, tmpOut1, tmpOut2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +\n\ +#define TENSOR_MOD(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \\\n\ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \\\n\ +__kernel void mod_##src0_name##src1_name##to##dst_name \\\n\ + ( \\\n\ + image2d_array_t input0, \\\n\ + image2d_array_t input1, \\\n\ + image2d_array_t output, \\\n\ + int isfmod \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +\n\ +TENSOR_MOD(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\ +TENSOR_MOD(F16, F16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +TENSOR_MOD(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +TENSOR_MOD(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +\n\ +TENSOR_MOD(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\ + vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +\n\ +#define TENSOR_MOD_2D(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \\\n\ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \\\n\ +__kernel void mod_##src0_name##src1_name##to##dst_name##_2D \\\n\ + ( \\\n\ + image2d_array_t input0, \\\n\ + image2d_array_t input1, \\\n\ + image2d_array_t output, \\\n\ + int isfmod \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +TENSOR_MOD_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\ +TENSOR_MOD_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +TENSOR_MOD_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, 
out_scale, out_zp)\n\ +TENSOR_MOD_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +\n\ +TENSOR_MOD_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\ + vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD_2D(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define MOD_BF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 read_data0, read_data1, vec0; \\\n\ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \\\n\ + vxc_float4 tmpVal1, tmpVal2; \\\n\ + vxc_ushort8 dst0, dst1; \\\n\ + vxc_ushort8 vect; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, in0Val1, vec0, 16); \\\n\ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, in0Val2, vec0, 16); \\\n\ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, in1Val1, vec0, 16); \\\n\ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, in1Val2, vec0, 16); \\\n\ + tmpVal1 = fmod(in0Val1, in1Val1); \\\n\ + tmpVal2 = fmod(in0Val2, in1Val2); \\\n\ + _viv_asm(COPY, dst0, tmpVal1, 16); \\\n\ + _viv_asm(COPY, dst1, tmpVal2, 16); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void mod_BF16BF16toBF16\n\ + (\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output,\n\ + int isfmod\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + MOD_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray);\n\ +}\n\ +\n\ +__kernel void mod_BF16BF16toBF16_2D\n\ + (\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output,\n\ + int isfmod\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + MOD_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage);\n\ +}"; /* end of mod_vx*/ + static const char moments_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -29451,588 +26852,172 @@ __kernel void 
poolwithargmax_U8to_F16_I16_2D\n\ \n\ "; /* end of poolwithargmax_U8_vx*/ -static const char pow_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pow_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float input0_scale;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float input0_tail;\n\ +_viv_uniform float input1_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2;\n\ -\n\ -_viv_uniform int input_ZP1;\n\ -\n\ -_viv_uniform float output_ZP;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pow_F16F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +#define POW_SH_IMPL(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \\\n\ +__kernel void pow_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + src0_type src0; \\\n\ + copy0_type data0; \\\n\ + src0_type src1; \\\n\ + copy0_type data1; \\\n\ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data0, src0, 16); \\\n\ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data1, src1, 16); \\\n\ + float4 x0, x1; \\\n\ + float4 y0, y1; \\\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\ + x0 = x0 * input0_scale + input0_tail; \\\n\ + x1 = x1 * input0_scale + input0_tail; \\\n\ + y0 = y0 * input1_scale + input1_tail; \\\n\ + y1 = y1 * input1_scale + input1_tail; \\\n\ + float4 s0 = sign(x0); \\\n\ + float4 s1 = sign(x1); \\\n\ + int4 t0 = convert_int4(y0) & 1; \\\n\ + int4 t1 = convert_int4(y1) & 1; \\\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \\\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; \\\n\ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \\\n\ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \\\n\ + x0 = x0 * output_scale + output_zp; \\\n\ + x1 = x1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, x0); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, x1); \\\n\ + dst_type dst0; \\\n\ + \\\n\ + copy2_type dst; \\\n\ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +POW_SH_IMPL(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ \n\ -__kernel void pow_F16F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - 
_viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, dst, data0, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, dst, data0, 16);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_fp16_vx*/ - -static const char pow_fp16_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_F16F16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toI16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\
- tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\
- tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\
-\n\
- int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\
- int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniConvertInt32toUint8_2x8);\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void pow_F16I16toI16_2D(\n\
- image2d_array_t input0,\n\
- image2d_array_t input1,\n\
- image2d_array_t output)\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- vxc_half8 data0;\n\
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- float4 x0, x1;\n\
- float4 y0, y1;\n\
- float4 tmpDst0, tmpDst1;\n\
- VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\
- VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\
- VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\
- VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\
- float4 s0 = sign(x0);\n\
- float4 s1 = sign(x1);\n\
- int4 t0 = convert_int4(y0) & 1;\n\
- int4 t1 = convert_int4(y1) & 1;\n\
- s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\
- s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\
- tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\
- tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\
-\n\
- int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\
- int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniConvertInt32toUint8_2x8);\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+#define POW_SH_IMPL_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \\\n\
+__kernel void pow_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ src0_type src0; \\\n\
+ copy0_type data0; \\\n\
+ src1_type src1; \\\n\
+ copy1_type data1; \\\n\
+ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data0, src0, 16); \\\n\
+ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data1, src1, 16); \\\n\
+ float4 x0, x1; \\\n\
+ float4 y0, y1; \\\n\
+ VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\
+ VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\
+ VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\
+ VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\
+ x0 = x0 * input0_scale + input0_tail; \\\n\
+ x1 = x1 * input0_scale + input0_tail; \\\n\
+ y0 = y0 * input1_scale + 
input1_tail; \\\n\ + y1 = y1 * input1_scale + input1_tail; \\\n\ + float4 s0 = sign(x0); \\\n\ + float4 s1 = sign(x1); \\\n\ + int4 t0 = convert_int4(y0) & 1; \\\n\ + int4 t1 = convert_int4(y1) & 1; \\\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \\\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; \\\n\ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \\\n\ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \\\n\ + x0 = x0 * output_scale + output_zp; \\\n\ + x1 = x1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, x0); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, x1); \\\n\ + dst_type dst0; \\\n\ + \\\n\ + copy2_type dst; \\\n\ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +POW_SH_IMPL_2D(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL_2D(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL_2D(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL_2D(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ -__kernel 
void pow_BF16BF16toBF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ +__kernel void pow_BF16_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ @@ -30071,10 +27056,12 @@ __kernel void pow_BF16BF16toBF16(\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void pow_BF16BF16toBF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ +__kernel void pow_BF16_BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ \n\ @@ -30111,1057 +27098,7 @@ __kernel void pow_BF16BF16toBF16_2D(\n\ _viv_asm(COPY, src1, tmpDst1, 16);\n\ VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_fp16_i16_vx*/ - -static const char pow_fp16_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_F16F16toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_char8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_char8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, dst;\n\ - vxc_char8 src1;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? 
convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, dst;\n\ - vxc_char8 src1;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_char8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0;\n\ - vxc_char8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_fp16_i8_vx*/ - -static const char pow_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_I16F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16F16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16F16toI16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16I16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16I16toI16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_i16_vx*/ - -static const char pow_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_I8F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_char8 src0;\n\ - vxc_short8 src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_char8 src0;\n\ - vxc_short8 src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8F16toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_char8 src0, dst;\n\ - vxc_short8 src1;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8F16toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_char8 src0, dst;\n\ - vxc_short8 src1;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8I8toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_char8 src0, src1, dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8I8toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_char8 src0, src1, dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_i8_vx*/ - -static const char pow_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ -\n\ -_viv_uniform int input_ZP0;\n\ -_viv_uniform int input_ZP1;\n\ -_viv_uniform float output_ZP;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pow_U8F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, 
input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8F16toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8F16toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0, src1, dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0, src1, dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 tmpVal;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 tmpVal;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_u8_vx*/ +}"; /* end of pow_vx*/ static const char pre_process_bgra_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -32586,6 +28523,659 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ +#define RESIZE_BILINEAR_4X1(mean, output) \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_out, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int 
*yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + int4 coord_in = (int4)(0, 0, 0, 0); \\\n\ + sx = sx + *xOffset; \\\n\ + coord = sx.xyzw; \\\n\ + coord_in.y = sy + *yOffset; \\\n\ + coord_in.x = coord.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + conv_type dst0; \\\n\ + dst_type dst1; \\\n\ + copy_type dst; \\\n\ + tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 
8); \\\n\ + VXC_WriteImage(output0, coord_out, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + RESIZE_BILINEAR_4X1(gMean, output1) \\\n\ + RESIZE_BILINEAR_4X1(bMean, output2) \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + int4 coord_in = (int4)(coord.xw, 0, 0); \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - 
temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + int4 dst0; \\\n\ + write_type dst; \\\n\ + tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 1; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 2; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 
0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ + +static const char pre_process_rgb888_planar_1_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2)(*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + dst_type dst0, dst1; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.x = coord.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ + rMean * output_scale - output_zp, output_scale); \\\n\ + \\\n\ + half4 
paramData_f16; \\\n\ + copy_type tmp_dst; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ + gMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ + bMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +\n\ +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + write_type dst; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, 
\\\n\ + rMean * output_scale - output_zp, output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ + gMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ + bMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_1_vx*/ + +static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +\n\ +__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output0,\n\ + __write_only image2d_array_t output1,\n\ + __write_only image2d_array_t output2,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out;\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + coord_out.xy = (coord_in.xy >> 2) * 3;\n\ + coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, 
VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 
1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_rgb888_planar_half_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output0,\n\ + __write_only image2d_array_t output1,\n\ + __write_only image2d_array_t output2,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_uchar16 src0, src1, src2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = coord_in.xy >> 1;\n\ +\n\ + VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_rgb888_planar_2_vx*/ + +static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ #define RESIZE_BILINEAR_4X1(input, mean, output) \\\n\ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -32624,8 +29214,8 @@ _viv_uniform float output_zp;\n\ VXC_WriteImage(output, coord_out, dst, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -32729,11 +29319,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ 
RESIZE_BILINEAR_4X1(input1, gMean, output1) \\\n\ RESIZE_BILINEAR_4X1(input2, bMean, output2) \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ -PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -32901,19 +29491,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ \\\n\ VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\ +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_sep_0_vx*/ -static const char pre_process_rgb888_planar_1_vx[] = "/*\n\ - ============================================================================\n\ - Name : GrayScale.vx\n\ - Author : Sam\n\ - Version :\n\ - Copyright : Your copyright notice\n\ - Description :\n\ - ============================================================================\n\ - */\n\ -#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ @@ -32921,8 +29502,8 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -32990,11 +29571,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ \n\ #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -33056,9 +29637,9 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ }\n\ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb888_planar_1_vx*/ +"; /* end of pre_process_rgb888_planar_sep_1_vx*/ -static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char 
pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ @@ -33066,7 +29647,7 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ \n\ -__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\ (\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ @@ -33148,7 +29729,7 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void pre_process_rgb888_planar_half_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ (\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ @@ -33180,7 +29761,7 @@ __kernel void pre_process_rgb888_planar_half_U8toU8\n\ VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of pre_process_rgb888_planar_2_vx*/ +"; /* end of pre_process_rgb888_planar_sep_2_vx*/ static const char pre_process_rgb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -41765,6 +38346,102 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ \n\ #endif"; /* end of resize_bilinear_U8_opt_vx*/ +static const char resize_bilinear_align_corners_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l11_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l20_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l21_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l30_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l31_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l40_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l41_4x8;\n\ +__kernel void resize_bilinear_U8toU8_SAME_8x_upsample_align_corners\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ +\n\ + vxc_uchar16 in0, in1, dst;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.xy = coord.xy << 3;\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ +\n\ + VXC_DP4x8(dst, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l10_4x8);\n\ + VXC_DP4x8(dst, in0, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniBilinear_8x_l10_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l20_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l21_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l30_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l31_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l40_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l41_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l30_4x8);\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l31_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l20_4x8);\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l21_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l10_4x8);\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_align_corners_vx*/ + static const char resize_bilinear_nhwc_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8;\n\ @@ -41970,6 +38647,161 @@ __kernel void resize_bilinear_nhwc_U8toU8_4x_upsample_half_pixel_centers\n\ VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }"; /* end of resize_bilinear_nhwc_vx*/ +static const char resize_bilinear_nhwc_bound_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8;\n\ +_viv_uniform int2 x_coord;\n\ +\n\ +__kernel void resize_bilinear_nhwc_bound_U8toU8_2x\n\ + (\n\ + __read_only image2d_array_t input,\n\ + image2d_array_t output,\n\ + __write_only image2d_array_t output1\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0));\n\ + int2 coord_in = (int2)(1, get_global_id(0));\n\ + coord_in.y = ((coord_out.y * 2 - 1) >> 2);\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, result;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.z = coord_out.y + 1;\n\ +\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = x_coord.x;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.x = x_coord.y;\n\ +\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l10_4x4;\n\ +__kernel void resize_bilinear_nhwc_bound_U8toU8_3x\n\ + (\n\ + __read_only image2d_array_t input,\n\ + image2d_array_t output,\n\ + __write_only image2d_array_t output1\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0));\n\ + int2 coord_in = (int2)(1, get_global_id(0));\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, result;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = x_coord.x;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.x = x_coord.y;\n\ +\n\ + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l10_4x8;\n\ +__kernel void resize_bilinear_nhwc_bound_U8toU8_4x\n\ + (\n\ + __read_only image2d_array_t input,\n\ + image2d_array_t output,\n\ + __write_only image2d_array_t output1\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0));\n\ + int2 coord_in = (int2)(1, get_global_id(0));\n\ + coord_in.y = (coord_out.y * 2 - 3) >> 3;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, dst0, dst1;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.z = coord_out.y + 1;\n\ +\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 2;\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x -= 2;\n\ +\n\ + coord_out.zw = coord_out.zz + (int2)(1, 2);\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 2;\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = x_coord.x;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.x = x_coord.y;\n\ + coord_out.z = coord_out.y + 1;\n\ +\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x -= 2;\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 2;\n\ +\n\ + coord_out.zw = coord_out.zz + (int2)(1, 2);\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x -= 2;\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ 
+}\n\ +"; /* end of resize_bilinear_nhwc_bound_vx*/ + static const char resize_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ @@ -43161,24 +39993,25 @@ __kernel void scatter_nd_update_F16F16toU8_big(\n\ static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvConditiontoDst_2x8;\n\ -_viv_uniform VXC_512Bits uniConvIntIn0toDst_2x8;\n\ -_viv_uniform VXC_512Bits uniConvIntIn1toDst_2x8;\n\ -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In0_2x8;\n\ -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In1_2x8;\n\ -_viv_uniform int input0Zp;\n\ -_viv_uniform int input1Zp;\n\ -_viv_uniform int outputZP;\n\ -_viv_uniform VXC_512Bits uniU8AddZP_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ \n\ #define SELECT_INT(type_name, read_fun, write_fun) \\\n\ - type_name tmp, src0, src1, dst, value; \\\n\ + type_name src0, src1, dst, value; \\\n\ vxc_char8 value_tmp; \\\n\ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + vxc_ushort8 mp0, mp1; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(src0, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn0toDst_2x8); \\\n\ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(src1, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn1toDst_2x8); \\\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift1_Lo_2x8); \\\n\ read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ @@ -43198,6 +40031,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name( \\\n\ }\n\ \n\ SELECT_INT_FUN(I8, I8, I8, vxc_char8)\n\ +SELECT_INT_FUN(I8, U8, U8, vxc_uchar8)\n\ SELECT_INT_FUN(I8, I16, I16, vxc_short8)\n\ \n\ #define SELECT_INT_FUN_2D(cond_name, src_name, dst_name, type_name) \\\n\ @@ -43212,6 +40046,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name##_2D( \ }\n\ \n\ SELECT_INT_FUN_2D(I8, I8, I8, vxc_char8)\n\ +SELECT_INT_FUN_2D(I8, U8, U8, vxc_uchar8)\n\ SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8)\n\ \n\ #define SELECT_HALF(read_fun, write_fun) \\\n\ @@ -43248,47 +40083,111 @@ __kernel void select_I8_F16_F16toF16_2D(\n\ SELECT_HALF(VXC_ReadImage, VXC_WriteImage)\n\ }\n\ \n\ -#define SELECT_U8(read_fun, write_fun) \\\n\ - vxc_uchar8 tmp, src0, src1, dst; \\\n\ - vxc_char8 value; \\\n\ - vxc_half8 tmp1; \\\n\ - vxc_uchar16 input0_ZP, input1_ZP, output_ZP; \\\n\ - _viv_asm(COPY, input0_ZP, input0Zp, 4); \\\n\ - _viv_asm(COPY, input1_ZP, input1Zp, 4); \\\n\ - _viv_asm(COPY, output_ZP, outputZP, 4); \\\n\ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ +#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \\\n\ + vxc_short8 src0, 
src1, dst, value; \\\n\ + vxc_half8 value0, value1; \\\n\ + src0_type r0; \\\n\ + src1_type r1; \\\n\ + copy0_type v0; \\\n\ + copy1_type v1; \\\n\ + vxc_char8 value_tmp; \\\n\ + vxc_ushort8 mp0, mp1; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(tmp1, tmp, input0_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniU8SubZP_MulM_PStoF16In0_2x8); \\\n\ - VXC_DP2x8(src0, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \\\n\ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + _viv_asm(COPY, v0, src0, 16); \\\n\ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(tmp1, tmp, input1_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniU8SubZP_MulM_PStoF16In1_2x8); \\\n\ - VXC_DP2x8(src1, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \\\n\ - read_fun(value, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + _viv_asm(COPY, v1, src1, 16); \\\n\ + VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + _viv_asm(COPY, src0, value0, 16); \\\n\ + VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift1_Lo_2x8); \\\n\ + _viv_asm(COPY, src1, value1, 16); \\\n\ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ dst = (value != 0 ? 
src0 : src1); \\\n\ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ -__kernel void select_I8_U8_U8toU8(\n\ +#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ +__kernel void select_##name( \\\n\ + __read_only image2d_array_t condition, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ +SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ +SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +\n\ +#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ +__kernel void select_##name( \\\n\ + __read_only image2d_array_t condition, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +\n\ +#define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \\\n\ + vxc_short8 src0, src1, tmp_dst, value; \\\n\ + vxc_half8 data; \\\n\ + dst_type dst; \\\n\ + vxc_char8 value_tmp; \\\n\ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ + tmp_dst = (value != 0 ? 
src0 : src1); \\\n\ + _viv_asm(COPY, data, tmp_dst, 16); \\\n\ + vxc_ushort8 mp0; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, data, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void select_I8_F16_F16toU8(\n\ __read_only image2d_array_t condition,\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output)\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - SELECT_U8(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ + SELECT_HALF_TO_QINT(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_uchar16)\n\ }\n\ \n\ -__kernel void select_I8_U8_U8toU8_2D(\n\ +__kernel void select_I8_F16_F16toU8_2D(\n\ __read_only image2d_array_t condition,\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output)\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - SELECT_U8(VXC_ReadImage, VXC_WriteImage)\n\ + SELECT_HALF_TO_QINT(VXC_ReadImage, VXC_WriteImage, vxc_uchar16)\n\ }\n\ "; /* end of select_vx*/ @@ -43667,7 +40566,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16)\n\ SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ \n\ -#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \\\n\ +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type, save_type) \\\n\ __kernel void slice_##name0##_I32to##name1 \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ @@ -43679,7 +40578,7 @@ __kernel void slice_##name0##_I32to##name1 \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ src_type src; \\\n\ copy_type src0; \\\n\ - dst_type dst; \\\n\ + dst_type result; \\\n\ int4 coord_in; \\\n\ Image begin_img = create_image_from_image2d(input1, 4); \\\n\ uchar* begin_ptr = begin_img.ptr; \\\n\ @@ -43691,15 +40590,19 @@ __kernel void slice_##name0##_I32to##name1 \\\n\ \\\n\ vxc_ushort8 multiplier; \\\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ -SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ -SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ -SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16)\n\ +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16)\n\ +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ \n\ -#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type, save_type) \\\n\ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ @@ -43711,7 +40614,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ int2 coord = 
(int2)(get_global_id(0), get_global_id(1)); \\\n\ src_type src; \\\n\ copy_type src0; \\\n\ - dst_type dst; \\\n\ + dst_type result; \\\n\ int2 coord_in; \\\n\ Image begin_img = create_image_from_image2d(input1, 4); \\\n\ uchar* begin_ptr = begin_img.ptr; \\\n\ @@ -43723,13 +40626,18 @@ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ \\\n\ vxc_ushort8 multiplier; \\\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ -SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ -SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ -SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8)"; /* end of slice_vx*/ +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16)\n\ +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16)\n\ +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO_2D(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +"; /* end of slice_vx*/ static const char space2depth_internal_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -47586,6 +44494,802 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.z = channel - 1;\n\ + write_imagef(output, coord_out, sum);\n\ +\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.z = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.z = channel - 1;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.z--;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.z = 0;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.z++;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.y = height - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.y = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t 
output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.y = height - 1;\n\ + write_imageui(output, coord_out, dst);\n\ +\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + coord_out.y--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.y = 0;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + coord_out.y++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.x = width - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.x--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.x = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.x++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + 
{\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.x = width - 1;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.x--;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.x = 0;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.x++;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_cl*/ + +static const char cumsum_2d_cl[] = "\n\ +__kernel void cumsum_F32toF32_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + 
for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imageui(output, coord.zw, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imageui(output, coord.zw, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + 
{\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imageui(output, coord.zw, sum);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imageui(output, coord.zw, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_2d_cl*/ + static const char depth2space_crd_cl[] = "\n\ __kernel void depth2space_crd_F32toF32(\n\ image2d_array_t input, image2d_array_t output, int block_size)\n\ @@ -47949,6 +45653,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float eltwise_unary_sign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return val / (1.0f + fabs(val));\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ ( \\\n\ @@ -47983,6 +45702,9 @@ ELTWISE_UNARY_F32_2D(gelu)\n\ ELTWISE_UNARY_F32_2D(hard_gelu)\n\ ELTWISE_UNARY_F32_2D(selu)\n\ ELTWISE_UNARY_F32_2D(celu)\n\ +ELTWISE_UNARY_F32_2D(rcp)\n\ +ELTWISE_UNARY_F32_2D(sign)\n\ +ELTWISE_UNARY_F32_2D(softsign)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -48019,6 +45741,9 @@ ELTWISE_UNARY_U8_2D(gelu)\n\ ELTWISE_UNARY_U8_2D(hard_gelu)\n\ ELTWISE_UNARY_U8_2D(selu)\n\ ELTWISE_UNARY_U8_2D(celu)\n\ +ELTWISE_UNARY_U8_2D(rcp)\n\ +ELTWISE_UNARY_U8_2D(sign)\n\ +ELTWISE_UNARY_U8_2D(softsign)\n\ \n\ __kernel void neg_I32toI32_2D\n\ (\n\ @@ -48179,6 +45904,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha)\n\ return val < 0 ? x : val;\n\ }\n\ \n\ +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float eltwise_unary_sign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return val / (1.0f + fabs(val));\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ __kernel void func_name##_F32toF32 \\\n\ ( \\\n\ @@ -48213,6 +45953,9 @@ ELTWISE_UNARY_F32(gelu)\n\ ELTWISE_UNARY_F32(hard_gelu)\n\ ELTWISE_UNARY_F32(selu)\n\ ELTWISE_UNARY_F32(celu)\n\ +ELTWISE_UNARY_F32(rcp)\n\ +ELTWISE_UNARY_F32(sign)\n\ +ELTWISE_UNARY_F32(softsign)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -48249,6 +45992,9 @@ ELTWISE_UNARY_U8(gelu)\n\ ELTWISE_UNARY_U8(hard_gelu)\n\ ELTWISE_UNARY_U8(selu)\n\ ELTWISE_UNARY_U8(celu)\n\ +ELTWISE_UNARY_U8(rcp)\n\ +ELTWISE_UNARY_U8(sign)\n\ +ELTWISE_UNARY_U8(softsign)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -50552,16 +48298,13 @@ __kernel void hswish_I32toI32_2D(\n\ }\n\ "; /* end of hswish_cl*/ -static const char instance_normalization_f16_cl[] = "__kernel void instance_norm_meanvari_F16(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +static const char instance_normalization_f32_cl[] = "__kernel void instance_norm_sums_F32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -50581,8 +48324,8 @@ static const char instance_normalization_f16_cl[] = "__kernel void instance_norm {\n\ data = read_imagef(input, coord);\n\ coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ }\n\ }\n\ lcl_sum[lidx] = sum;\n\ @@ -50612,16 +48355,13 @@ static const char instance_normalization_f16_cl[] = "__kernel void instance_norm }\n\ }\n\ \n\ -__kernel void instance_norm_meanvari_F16_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - 
int height\n\ +__kernel void instance_norm_sums_F32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -50643,239 +48383,8 @@ __kernel void instance_norm_meanvari_F16_2D(\n\ {\n\ data = read_imagef(input, coord);\n\ coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ - }\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 dst = (float4)(0);\n\ - dst.x = sum;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - coord_out.x++;\n\ - dst.x = sqr;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - }\n\ -}\n\ -\n\ -__kernel void instance_norm_F16toF16(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ - )\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ - int4 coord_para = (int4)(0, gidz, 0, 0);\n\ -\n\ - float4 gamma = read_imagef(scale, coord_para.yx);\n\ - float4 beta = read_imagef(bias, coord_para.yx);\n\ - float4 mean_vari = (float4)(0);\n\ - float scale_vari, bias_val;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ - coord_para.x++;\n\ - mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ - coord_para.x+=3;\n\ - }\n\ - mean_vari *= dim_ratio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = gamma.s0 * mean_vari.s1;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - float4 data, dst;\n\ - for(coord.y = 0; coord.y < height;coord.y++)\n\ - {\n\ - data = read_imagef(input, coord);\n\ -\n\ - dst.x = data.x * scale_vari + bias_val;\n\ - write_imagef(output, coord, dst);\n\ - }\n\ -}\n\ -\n\ -__kernel void instance_norm_F16toF16_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ - )\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int2 coord_para = (int2)(0, gidz);\n\ - int endH = gidy + height;\n\ -\n\ - float4 gamma = read_imagef(scale, coord_para.yx);\n\ - float4 beta = read_imagef(bias, coord_para.yx);\n\ - float4 mean_vari = (float4)(0);\n\ - float scale_vari, bias_val;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ - 
coord_para.x++;\n\ - mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ - coord_para.x+=3;\n\ - }\n\ - mean_vari *= dim_ratio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = gamma.s0 * mean_vari.s1;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - float4 data, dst;\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - data = read_imagef(input, coord);\n\ -\n\ - dst.x = data.x * scale_vari + bias_val;\n\ - write_imagef(output, coord, dst);\n\ - }\n\ -}\n\ -"; /* end of instance_normalization_f16_cl*/ - -static const char instance_normalization_f32_cl[] = "__kernel void instance_norm_meanvari_F32(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ - )\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidz = get_global_id(1);\n\ - int lidx = get_local_id(0);\n\ -\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - float4 data;\n\ - float sum = 0, sqr = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - data = read_imagef(input, coord);\n\ - coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ - }\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 dst = (float4)(0);\n\ - dst.x = sum;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - coord_out.x++;\n\ - dst.x = sqr;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - }\n\ -}\n\ -\n\ -__kernel void instance_norm_meanvari_F32_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ - )\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidz = get_global_id(1);\n\ - int lidx = get_local_id(0);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - float4 data;\n\ - float sum = 0, sqr = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - data = read_imagef(input, coord);\n\ - coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ }\n\ }\n\ lcl_sum[lidx] = sum;\n\ @@ -50906,23 +48415,19 @@ __kernel void instance_norm_meanvari_F32_2D(\n\ }\n\ \n\ __kernel void instance_norm_F32toF32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + 
__read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -50941,7 +48446,7 @@ __kernel void instance_norm_F32toF32(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ @@ -50959,23 +48464,19 @@ __kernel void instance_norm_F32toF32(\n\ }\n\ \n\ __kernel void instance_norm_F32toF32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -50996,12 +48497,12 @@ __kernel void instance_norm_F32toF32_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ \n\ float4 data, dst;\n\ for(; coord.y < endH; coord.y++)\n\ @@ -51014,16 +48515,13 @@ __kernel void instance_norm_F32toF32_2D(\n\ }\n\ "; /* end of instance_normalization_f32_cl*/ -static const char instance_normalization_i32_cl[] = "__kernel void instance_norm_meanvari_I32(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +static const char instance_normalization_i32_cl[] = "__kernel void instance_norm_sums_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51032,9 +48530,8 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm \n\ int4 coord = (int4)(gidx, 0, gidz, 0);\n\ int4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0;\n\ - float e2InScale = input_fl * input_fl;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51045,13 +48542,13 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm {\n\ data = read_imagei(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - sqr += (data.x * data.x * e2InScale);\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sum = tmpSum * input_fl;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - 
lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51061,7 +48558,7 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51077,16 +48574,13 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm }\n\ }\n\ \n\ -__kernel void instance_norm_meanvari_I32_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +__kernel void instance_norm_sums_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51096,9 +48590,8 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ \n\ int2 coord = (int2)(gidx, gidy);\n\ int4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0;\n\ - float e2InScale = input_fl * input_fl;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51110,13 +48603,13 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ {\n\ data = read_imagei(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - sqr += (data.x * data.x * e2InScale);\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sum = tmpSum * input_fl;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51126,7 +48619,7 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51143,23 +48636,19 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ }\n\ \n\ __kernel void instance_norm_I32toI32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51178,13 +48667,13 @@ __kernel void instance_norm_I32toI32(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - 
mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * output_fl * scale_vari;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl;\n\ + float alpha = output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ int4 data, dst;\n\ for(coord.y = 0; coord.y < height;coord.y++)\n\ @@ -51199,23 +48688,19 @@ __kernel void instance_norm_I32toI32(\n\ }\n\ \n\ __kernel void instance_norm_I32toI32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51236,13 +48721,13 @@ __kernel void instance_norm_I32toI32_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * output_fl * scale_vari;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl;\n\ + float alpha = output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ int4 data, dst;\n\ for(; coord.y < endH; coord.y++)\n\ @@ -51257,23 +48742,19 @@ __kernel void instance_norm_I32toI32_2D(\n\ }\n\ \n\ __kernel void instance_norm_I32toF32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51292,12 +48773,12 @@ __kernel void instance_norm_I32toF32(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * scale_vari;\n\ + float alpha = scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ \n\ int4 data;\n\ @@ -51312,23 +48793,19 @@ __kernel void instance_norm_I32toF32(\n\ }\n\ \n\ __kernel void 
instance_norm_I32toF32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51349,12 +48826,12 @@ __kernel void instance_norm_I32toF32_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * scale_vari;\n\ + float alpha = scale_vari;\n\ bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ \n\ int4 data;\n\ @@ -51369,16 +48846,13 @@ __kernel void instance_norm_I32toF32_2D(\n\ }\n\ "; /* end of instance_normalization_i32_cl*/ -static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_meanvari_U8(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_sums_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51387,9 +48861,8 @@ static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_ \n\ int4 coord = (int4)(gidx, 0, gidz, 0);\n\ uint4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - float e2InScale = input_scale * input_scale;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51400,14 +48873,13 @@ static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_ {\n\ data = read_imageui(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - tmpSqr += data.x * data.x;\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ - sum = (tmpSum - height * input_zp) * input_scale;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51417,7 +48889,7 @@ static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51433,16 +48905,13 @@ static const char instance_normalization_u8_cl[] = 
"__kernel void instance_norm_ }\n\ }\n\ \n\ -__kernel void instance_norm_meanvari_U8_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +__kernel void instance_norm_sums_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51452,9 +48921,8 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ \n\ int2 coord = (int2)(gidx, gidy);\n\ uint4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - float e2InScale = input_scale * input_scale;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51466,14 +48934,13 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ {\n\ data = read_imageui(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - tmpSqr += data.x * data.x;\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ - sum = (tmpSum - height * input_zp) * input_scale;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51483,7 +48950,7 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51500,23 +48967,19 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ }\n\ \n\ __kernel void instance_norm_U8toU8(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51527,7 +48990,6 @@ __kernel void instance_norm_U8toU8(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51536,19 +48998,18 @@ __kernel void instance_norm_U8toU8(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * 
mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data, dst;\n\ for(coord.y = 0; coord.y < height;coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -51558,23 +49019,19 @@ __kernel void instance_norm_U8toU8(\n\ }\n\ \n\ __kernel void instance_norm_U8toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51587,7 +49044,6 @@ __kernel void instance_norm_U8toU8_2D(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51596,19 +49052,18 @@ __kernel void instance_norm_U8toU8_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data, dst;\n\ for(; coord.y < endH; coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -51618,23 +49073,19 @@ __kernel void instance_norm_U8toU8_2D(\n\ }\n\ \n\ __kernel void instance_norm_U8toF16(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51645,7 +49096,6 @@ __kernel void instance_norm_U8toF16(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51654,19 +49104,18 @@ __kernel void instance_norm_U8toF16(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= 
inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data;\n\ for(coord.y = 0; coord.y < height;coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -51675,23 +49124,19 @@ __kernel void instance_norm_U8toF16(\n\ }\n\ \n\ __kernel void instance_norm_U8toF16_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51704,7 +49149,6 @@ __kernel void instance_norm_U8toF16_2D(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51713,19 +49157,18 @@ __kernel void instance_norm_U8toF16_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data;\n\ for(; coord.y < endH; coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -56132,6 +53575,391 @@ __kernel void maximum_I32I32toI32_2D\n\ }\n\ "; /* end of maximum_cl*/ +static const char maxpoolwithargmax_cl[] = "#define FP32_MIN -3.4e38\n\ +#define I32_MIN -2147483647\n\ +\n\ +__kernel void maxpoolwithargmax_F32toF32_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + float value_max = FP32_MIN;\n\ + float4 dst = (float4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for 
(h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + float4 data = read_imagef(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + dst.x = value_max;\n\ + write_imagef(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_BF16toBF16_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + float value_max = FP32_MIN;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 src = read_imageui(input, coord_in);\n\ + src = src << 16;\n\ + float4 data;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + _viv_asm(COPY, dst, value_max, 4);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_U32toU32_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + uint value_max = 0;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ +\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 data = read_imageui(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + dst.x = convert_uint(convert_float(value_max) * scale + tail);\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_I32toI32_I32(\n\ + __read_only 
image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + int value_max = I32_MIN;\n\ + int4 dst = (int4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + int4 data = read_imagei(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + dst.x = convert_int(convert_float(value_max) * scale + tail);\n\ + write_imagei(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +"; /* end of maxpoolwithargmax_cl*/ + +static const char maxpoolwithargmax_2d_cl[] = "#define FP32_MIN -3.4e38\n\ +#define I32_MIN -2147483647\n\ +\n\ +__kernel void maxpoolwithargmax_F32toF32_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + float value_max = FP32_MIN;\n\ + float4 dst = (float4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + float4 data = read_imagef(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + dst.x = value_max;\n\ + write_imagef(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_BF16toBF16_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + 
float value_max = FP32_MIN;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 src = read_imageui(input, coord_in);\n\ + src = src << 16;\n\ + float4 data;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + _viv_asm(COPY, dst, value_max, 4);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_U32toU32_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + uint value_max = 0;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 data = read_imageui(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + dst.x = convert_uint(convert_float(value_max) * scale + tail);\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_I32toI32_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + int value_max = I32_MIN;\n\ + int4 dst = (int4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + int4 data = read_imagei(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + dst.x = convert_int(convert_float(value_max) * scale + tail);\n\ + write_imagei(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +"; /* end of maxpoolwithargmax_2d_cl*/ + static const char 
minimum_cl[] = "__kernel void minimum_FP32FP32toFP32\n\ (\n\ __read_only image2d_array_t input0,\n\ @@ -56290,6 +54118,314 @@ __kernel void minimum_I32I32toI32_2D\n\ }\n\ "; /* end of minimum_cl*/ +static const char mod_cl[] = "__kernel void mod_F32F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src0;\n\ + float4 src1;\n\ + READ_IMAGEF_2DARRAY(src0, input, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ + float4 dst = fmod(src0, src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_F32F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src0 = read_imagef(input, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ + float4 dst = fmod(src0, src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float 
outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0, src1;\n\ + float4 in0, in1, out;\n\ + READ_IMAGEUI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + uint4 src1 = read_imageui(input1, coord);\n\ + float4 in0, in1, out;\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8I32toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t 
output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0;\n\ + int4 src1;\n\ + float4 in0, in1, out;\n\ + READ_IMAGEUI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0, in1, out;\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +"; /* end of mod_cl*/ + static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_t output_mean,\n\ @@ -60201,21 +58337,25 @@ static const char roi_align_cl[] = "inline float roi_align_1x1\n\ \n\ \n\ #define EPS_GRID 0.00001f\n\ -__kernel void roi_align_F32toF32\n\ +__kernel void roi_align_F32_F32toF32\n\ (\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t rois,\n\ - __read_only image2d_t n_rois,\n\ - __write_only image2d_array_t output,\n\ - float spatial_x_scale,\n\ - float spatial_y_scale,\n\ - float in_width,\n\ - float in_height,\n\ - float rcp_of_out_width,\n\ - float rcp_of_out_height,\n\ - float sampling_x_ratio,\n\ - float sampling_y_ratio,\n\ - int depth\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t rois,\n\ + __read_only image2d_t n_rois,\n\ + __write_only image2d_array_t output,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp,\n\ + float spatial_x_scale,\n\ + float spatial_y_scale,\n\ + float in_width,\n\ + float in_height,\n\ + float rcp_of_out_width,\n\ + float rcp_of_out_height,\n\ + float sampling_x_ratio,\n\ + float sampling_y_ratio,\n\ + int depth\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -60261,6 +58401,128 @@ __kernel void roi_align_F32toF32\n\ \n\ write_imagef(output, (int4)(px, py, kz1, 0), interp);\n\ }\n\ +}\n\ +\n\ +inline float roi_align_1x1_U8toF32\n\ +(\n\ + __read_only image2d_array_t input,\n\ + float input_scale,\n\ + float input_tail,\n\ + float2 region_start,\n\ + float2 region_end,\n\ + float2 bin_size,\n\ + int2 grid_size,\n\ + float2 rcp_of_grid_size,\n\ + int pz\n\ +)\n\ +{\n\ + float sum = 0;\n\ +\n\ + for(int iy = 0; iy < grid_size.y; ++iy)\n\ + {\n\ + for(int ix = 0; ix < grid_size.x; ++ix)\n\ + {\n\ + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);\n\ + float2 pos = region_start + ixy * 
bin_size * rcp_of_grid_size;\n\ +\n\ + int2 xy_low = convert_int2(pos);\n\ + int2 xy_high = xy_low + 1;\n\ +\n\ + float ly = pos.y - xy_low.y;\n\ + float lx = pos.x - xy_low.x;\n\ + float hy = 1.0f - ly;\n\ + float hx = 1.0f - lx;\n\ +\n\ + float w1 = hy * hx;\n\ + float w2 = hy * lx;\n\ + float w3 = ly * hx;\n\ + float w4 = ly * lx;\n\ +\n\ + uint4 data;\n\ + data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ + data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ + data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;\n\ + data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;\n\ +\n\ + float4 value = convert_float4(data) * input_scale + input_tail;\n\ +\n\ + sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w;\n\ + }\n\ + }\n\ +\n\ + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ +}\n\ +\n\ +__kernel void roi_align_U8_U16toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t rois,\n\ + __read_only image2d_t n_rois,\n\ + __write_only image2d_array_t output,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp,\n\ + float spatial_x_scale,\n\ + float spatial_y_scale,\n\ + float in_width,\n\ + float in_height,\n\ + float rcp_of_out_width,\n\ + float rcp_of_out_height,\n\ + float sampling_x_ratio,\n\ + float sampling_y_ratio,\n\ + int depth\n\ +)\n\ +{\n\ + int px = get_global_id(0);\n\ + int py = get_global_id(1);\n\ + int pw = get_global_id(2);\n\ +\n\ + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x;\n\ + float4 roi_x = convert_float4(read_imageui(rois, (int2)(0, pw)));\n\ + float4 roi_y = convert_float4(read_imageui(rois, (int2)(1, pw)));\n\ + float4 roi_z = convert_float4(read_imageui(rois, (int2)(2, pw)));\n\ + float4 roi_w = convert_float4(read_imageui(rois, (int2)(3, pw)));\n\ + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x);\n\ +\n\ + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale);\n\ + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f);\n\ +\n\ + float2 spatial_indx = (float2)(px, py);\n\ + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ + float2 max_spatial_dims = (float2)(in_width, in_height);\n\ +\n\ + float2 bin_size = roi_dims * pooled_dims;\n\ + float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ + float2 region_end = region_start + bin_size;\n\ +\n\ + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio);\n\ +\n\ + roi_bin_grid = roi_bin_grid == 0 ? ceil(bin_size - EPS_GRID) : roi_bin_grid;\n\ +\n\ + int kz = roi_batch * depth;\n\ + float2 rcp_of_grid_size = 1.0f / roi_bin_grid;\n\ + int2 grid_size_xy = convert_int2(roi_bin_grid);\n\ + float4 interp;\n\ + int kz1 = pw * depth;\n\ + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++)\n\ + {\n\ + interp.x = roi_align_1x1_U8toF32( input,\n\ + input_scale,\n\ + input_tail,\n\ + region_start,\n\ + region_end,\n\ + bin_size,\n\ + grid_size_xy,\n\ + rcp_of_grid_size,\n\ + kz);\n\ +\n\ + uint4 dst;\n\ + interp.x = interp.x * output_scale + output_zp;\n\ + interp.x = interp.x < 255 ? 
interp.x : 255;\n\ + dst.x = convert_uint_rte(interp.x);\n\ + write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx);\n\ + }\n\ }"; /* end of roi_align_cl*/ static const char scatter_nd_cl[] = "__kernel void scatter_nd_U32toU32_1D(\n\ @@ -61471,6 +59733,334 @@ TOPK_I32(1 << 5, 5)\n\ TOPK_I32(1 << 6, 6)\n\ "; /* end of topk_cl*/ +static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ +\n\ + write_imagef(input_t, coord.xy, data);\n\ + write_imagei(indices_t, coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + write_imagef(output, coord.xy, data);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_U32toU32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + 
image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ +\n\ + write_imageui(input_t, coord.xy, data);\n\ + write_imagei(indices_t, coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + uint4 left = read_imageui(input_t, coord.xy);\n\ + uint4 right = read_imageui(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imageui(input_t, coord.xy, right);\n\ + write_imageui(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + uint4 left = read_imageui(input_t, coord.xy);\n\ + uint4 right = read_imageui(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imageui(input_t, coord.xy, right);\n\ + write_imageui(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + uint4 data = read_imageui(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + write_imageui(output, coord.xy, data);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_I32toI32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + int4 data = read_imagei(input, coord.xy);\n\ +\n\ + write_imagei(input_t, coord.xy, data);\n\ + write_imagei(indices_t, 
coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + int4 left = read_imagei(input_t, coord.xy);\n\ + int4 right = read_imagei(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagei(input_t, coord.xy, right);\n\ + write_imagei(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + int4 left = read_imagei(input_t, coord.xy);\n\ + int4 right = read_imagei(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagei(input_t, coord.xy, right);\n\ + write_imagei(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + int4 data = read_imagei(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + write_imagei(output, coord.xy, data);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}"; /* end of topk_odd_even_sort_cl*/ + static const char upsample_cl[] = "\n\ #define UPSAMPLE_PROCESS(data_type, read_fun, write_fun) \\\n\ data_type src = 0; \\\n\ @@ -61704,6 +60294,10 @@ static const source_map_t evis_resource[] = {"clip_U8_vx", clip_U8_vx}, {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, {"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, + {"cumsum_vx", cumsum_vx}, + {"cumsum_2d_vx", cumsum_2d_vx}, + {"cumsum_bf16_vx", cumsum_bf16_vx}, + {"cumsum_f16_u8_vx", cumsum_f16_u8_vx}, {"custom_softmax_vx", custom_softmax_vx}, {"custom_warp_affine_vx", custom_warp_affine_vx}, {"custom_warp_perspective_vx", custom_warp_perspective_vx}, @@ -61733,14 +60327,9 @@ static const source_map_t evis_resource[] = {"gather_nd_3d_mix_vx", gather_nd_3d_mix_vx}, {"gather_nd_mix_vx", gather_nd_mix_vx}, {"get_matrix_vx", get_matrix_vx}, - {"group_normalization_f16_vx", group_normalization_f16_vx}, - {"group_normalization_f16_scale_vx", group_normalization_f16_scale_vx}, - {"group_normalization_i16_vx", group_normalization_i16_vx}, - {"group_normalization_i16_scale_vx", group_normalization_i16_scale_vx}, - {"group_normalization_i8_vx", group_normalization_i8_vx}, - {"group_normalization_i8_scale_vx", group_normalization_i8_scale_vx}, - {"group_normalization_u8_vx", 
group_normalization_u8_vx}, - {"group_normalization_u8_f16_vx", group_normalization_u8_f16_vx}, + {"group_normalization_0_vx", group_normalization_0_vx}, + {"group_normalization_1_vx", group_normalization_1_vx}, + {"group_normalization_2_vx", group_normalization_2_vx}, {"grucell_activation_vx", grucell_activation_vx}, {"grucell_activation_sma_vx", grucell_activation_sma_vx}, {"grucell_activation_z_h_vx", grucell_activation_z_h_vx}, @@ -61749,26 +60338,17 @@ static const source_map_t evis_resource[] = {"grucell_h_times_activation_r_vx", grucell_h_times_activation_r_vx}, {"grucell_reset_after_activation_vx", grucell_reset_after_activation_vx}, {"hswish_vx", hswish_vx}, - {"instance_normalization_f16_vx", instance_normalization_f16_vx}, - {"instance_normalization_i16_vx", instance_normalization_i16_vx}, - {"instance_normalization_i8_vx", instance_normalization_i8_vx}, - {"instance_normalization_scale_f32_vx", instance_normalization_scale_f32_vx}, - {"instance_normalization_scale_f32_bf16_vx", instance_normalization_scale_f32_bf16_vx}, - {"instance_normalization_scale_f32_f16_vx", instance_normalization_scale_f32_f16_vx}, - {"instance_normalization_u8_vx", instance_normalization_u8_vx}, - {"instance_normalization_u8_f16_vx", instance_normalization_u8_f16_vx}, + {"instance_normalization_0_vx", instance_normalization_0_vx}, + {"instance_normalization_1_vx", instance_normalization_1_vx}, + {"instance_normalization_2_vx", instance_normalization_2_vx}, + {"instance_normalization_3_vx", instance_normalization_3_vx}, {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, + {"l2normalizescale_axis0_2d_vx", l2normalizescale_axis0_2d_vx}, {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, - {"layer_normalization_vx", layer_normalization_vx}, - {"layer_normalization_2d_vx", layer_normalization_2d_vx}, - {"layer_normalization_i16_vx", layer_normalization_i16_vx}, - {"layer_normalization_scale_f32_vx", layer_normalization_scale_f32_vx}, - {"layer_normalization_scale_f32_2d_vx", layer_normalization_scale_f32_2d_vx}, - {"layer_normalization_scale_f32_bf16_vx", layer_normalization_scale_f32_bf16_vx}, - {"layer_normalization_u8_f16_vx", layer_normalization_u8_f16_vx}, - {"layer_normalization_wh_f16_vx", layer_normalization_wh_f16_vx}, - {"layer_normalization_wh_i16_vx", layer_normalization_wh_i16_vx}, - {"layer_normalization_wh_u8_vx", layer_normalization_wh_u8_vx}, + {"layer_normalization_0_vx", layer_normalization_0_vx}, + {"layer_normalization_1_vx", layer_normalization_1_vx}, + {"layer_normalization_2_vx", layer_normalization_2_vx}, + {"layer_normalization_3_vx", layer_normalization_3_vx}, {"log_softmax_axis0_vx", log_softmax_axis0_vx}, {"log_softmax_axis0_BF16_vx", log_softmax_axis0_BF16_vx}, {"log_softmax_axis1_vx", log_softmax_axis1_vx}, @@ -61815,6 +60395,7 @@ static const source_map_t evis_resource[] = {"maximum_1_vx", maximum_1_vx}, {"minimum_0_vx", minimum_0_vx}, {"minimum_1_vx", minimum_1_vx}, + {"mod_vx", mod_vx}, {"moments_axis0_vx", moments_axis0_vx}, {"moments_axis01_vx", moments_axis01_vx}, {"moments_axis012_vx", moments_axis012_vx}, @@ -61827,12 +60408,7 @@ static const source_map_t evis_resource[] = {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, {"poolwithargmax_I8_vx", poolwithargmax_I8_vx}, {"poolwithargmax_U8_vx", poolwithargmax_U8_vx}, - {"pow_fp16_vx", pow_fp16_vx}, - {"pow_fp16_i16_vx", pow_fp16_i16_vx}, - {"pow_fp16_i8_vx", pow_fp16_i8_vx}, - {"pow_i16_vx", pow_i16_vx}, - {"pow_i8_vx", pow_i8_vx}, - {"pow_u8_vx", pow_u8_vx}, + {"pow_vx", pow_vx}, {"pre_process_bgra_vx", 
pre_process_bgra_vx}, {"pre_process_gray_vx", pre_process_gray_vx}, {"pre_process_gray_2_vx", pre_process_gray_2_vx}, @@ -61844,6 +60420,9 @@ static const source_map_t evis_resource[] = {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, {"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx}, {"pre_process_rgb888_planar_2_vx", pre_process_rgb888_planar_2_vx}, + {"pre_process_rgb888_planar_sep_0_vx", pre_process_rgb888_planar_sep_0_vx}, + {"pre_process_rgb888_planar_sep_1_vx", pre_process_rgb888_planar_sep_1_vx}, + {"pre_process_rgb888_planar_sep_2_vx", pre_process_rgb888_planar_sep_2_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, {"pre_process_yuv420_copy_u8_vx", pre_process_yuv420_copy_u8_vx}, {"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx}, @@ -61893,7 +60472,9 @@ static const source_map_t evis_resource[] = {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, + {"resize_bilinear_align_corners_vx", resize_bilinear_align_corners_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, + {"resize_bilinear_nhwc_bound_vx", resize_bilinear_nhwc_bound_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, @@ -61934,6 +60515,8 @@ static const source_map_t cl_resource[] = {"clip_F32_cl", clip_F32_cl}, {"clip_I32_cl", clip_I32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"cumsum_cl", cumsum_cl}, + {"cumsum_2d_cl", cumsum_2d_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, @@ -61955,7 +60538,6 @@ static const source_map_t cl_resource[] = {"grucell_h_times_activation_r_cl", grucell_h_times_activation_r_cl}, {"grucell_reset_after_activation_cl", grucell_reset_after_activation_cl}, {"hswish_cl", hswish_cl}, - {"instance_normalization_f16_cl", instance_normalization_f16_cl}, {"instance_normalization_f32_cl", instance_normalization_f32_cl}, {"instance_normalization_i32_cl", instance_normalization_i32_cl}, {"instance_normalization_u8_cl", instance_normalization_u8_cl}, @@ -61992,7 +60574,10 @@ static const source_map_t cl_resource[] = {"matrixmul_cl", matrixmul_cl}, {"matrixmul_transA_cl", matrixmul_transA_cl}, {"maximum_cl", maximum_cl}, + {"maxpoolwithargmax_cl", maxpoolwithargmax_cl}, + {"maxpoolwithargmax_2d_cl", maxpoolwithargmax_2d_cl}, {"minimum_cl", minimum_cl}, + {"mod_cl", mod_cl}, {"moments_axis0_cl", moments_axis0_cl}, {"moments_axis01_cl", moments_axis01_cl}, {"moments_axis012_cl", moments_axis012_cl}, @@ -62036,6 +60621,7 @@ static const source_map_t cl_resource[] = {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, + {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, {"upsample_cl", upsample_cl}, }; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c index f1141ba..8fece69 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -63,43 +63,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(ABS, 1, 1) - /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); - IO_TYPE(D_F16, D_F32) - 
IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP) - - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - END_IO_TYPE_DECL(ABS) - if(!VALIDATE_OP_IO_TYPES(ABS, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - return TRUE; + return ret; } /* op_check() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c index 073d063..46e689c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c @@ -154,7 +154,10 @@ static vsi_bool op_setup attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = TRUE; attr.is_const = FALSE; - if (_is_float32_data_format(self, inputs, outputs)) + if (VSI_NN_TYPE_INT32 == outputs[0]->attr.dtype.vx_type){ + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } + else if(_is_float32_data_format( self, inputs, outputs )) { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 06d439b..b0eea1f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -350,19 +350,19 @@ static vsi_bool _dynamic_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) END_IO_TYPE_DECL(BATCHNORM_SINGLE) if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -399,24 
+399,33 @@ static vsi_bool _static_check ) { BEGIN_IO_TYPE_DECL(BATCH_NORM, 5, 1) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) END_IO_TYPE_DECL(BATCH_NORM) if (!VALIDATE_OP_IO_TYPES(BATCH_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 4ffe7ed..1eaa783 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -154,13 +153,25 @@ static vsi_bool op_check IO_TYPE(D_U32, D_U32) IO_TYPE(D_U32, D_BOOL8) IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM) IO_TYPE(D_F16, D_BOOL8) IO_TYPE(D_I16|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_DFP) 
+ IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_BOOL8) IO_TYPE(D_I16, D_F16) IO_TYPE(D_I16, D_I8|Q_DFP) IO_TYPE(D_I16, D_U8|Q_ASYM) @@ -172,6 +183,14 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_BOOL8) IO_TYPE(D_I8, D_F16) IO_TYPE(D_I8, D_I16|Q_DFP) IO_TYPE(D_I8, D_U8|Q_ASYM) @@ -191,10 +210,18 @@ static vsi_bool op_check IO_TYPE(D_U8, D_U32) IO_TYPE(D_U8, D_F32) IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F32, D_I16|Q_SYM) IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F32, D_I8|Q_SYM) IO_TYPE(D_F32, D_U8|Q_ASYM) IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_SYM) IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_SYM) IO_TYPE(D_I32, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F32) IO_TYPE(D_F16, D_I32) @@ -204,7 +231,11 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16) IO_TYPE(D_BOOL8, D_F16) IO_TYPE(D_BOOL8, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM) IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_SYM) IO_TYPE(D_BOOL8, D_U8|Q_ASYM) IO_TYPE(D_BOOL8, D_BOOL8) IO_TYPE(D_BOOL8, D_I16) @@ -212,12 +243,16 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_U8) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F32) IO_TYPE(D_U8|Q_ASYM, D_I32) IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(CAST) - if(!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -228,7 +263,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - static vsi_status op_optimize ( vsi_nn_node_t * self, @@ -249,7 +283,6 @@ static vsi_status op_optimize return status; } /* op_optimize() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c index 69dbfd5..6fd097c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c @@ -42,30 +42,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(CEIL, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - END_IO_TYPE_DECL(CEIL) - if (!VALIDATE_OP_IO_TYPES(CEIL, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = 
generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = FALSE; - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_FLOOR, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 6e7288b..b2b01f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -151,6 +151,13 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I16|Q_SYM, D_F16) IO_TYPE(D_BF16, D_BF16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(CLIP) if (!VALIDATE_OP_IO_TYPES(CLIP, self, inputs, self->input.num, outputs, self->output.num)) { @@ -249,7 +256,6 @@ static vsi_bool op_setup return ret; } /* op_init() */ - #ifdef __cplusplus extern "C" { #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index 01721e6..5ebe3cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -386,10 +386,18 @@ static vsi_bool op_setup if ( (self->nn_param.conv1d.ksize == 1024 && self->nn_param.conv1d.dilation == 1) || (self->nn_param.conv1d.ksize == 3 && self->nn_param.conv1d.dilation > 7) ) { - if (self->nn_param.conv1d.stride == 1 && self->nn_param.conv1d.multiplier == 0) + int32_t ksize = self->nn_param.conv1d.ksize; + int32_t stride = self->nn_param.conv1d.stride; + int32_t dilation = self->nn_param.conv1d.dilation; + int32_t real_kernel = ((ksize - 1) * dilation + ksize + stride - 1) / stride; +#define MAX_CONV1D_KERNEL_SIZE (255) + + if (self->nn_param.conv1d.stride == 1 && self->nn_param.conv1d.multiplier == 0 && + real_kernel > MAX_CONV1D_KERNEL_SIZE) { +#undef MAX_CONV1D_KERNEL_SIZE self->nn_param.conv1d.local->use_ovxlib_kernel = TRUE; - if ((p->pad[0] || p->pad[1]) && (inputs[0]->attr.size[0] >= 65535)) + if ((p->pad[0] || p->pad[1]) && (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH)) { vsi_nn_tensor_attr_t attr; vsi_nn_internal_node_t* curr = NULL; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index 7dbe943..ba50ffd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -223,6 +223,60 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + 
IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_F32) + + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) @@ -244,6 +298,29 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, 
D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) @@ -281,6 +358,9 @@ static vsi_bool op_check IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + + /* HW 9.1.1 */ IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) @@ -295,6 +375,18 @@ static vsi_bool op_check IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + END_IO_TYPE_DECL(CONV2D) ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index 098c935..6aaa61d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -35,14 +35,10 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (3) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) static vsi_status op_compute ( @@ -51,92 +47,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; - vx_nn_stride_slice_params_t param; - vsi_nn_tensor_t *begin_dims_tensor = NULL; - vsi_nn_tensor_t *end_dims_tensor = NULL; - vsi_nn_tensor_t *stride_dims_tensor = NULL; - vsi_nn_tensor_attr_t attr; - vsi_size_t start[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t end[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t stride[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t i; - - memset(¶m, 0, sizeof(vx_nn_stride_slice_params_t)); - - for (i = 0; i < self->nn_param.crop.dims; i++) - { - start[i] = self->nn_param.crop.offset[i]; - end[i] = self->nn_param.crop.offset[i] + outputs[0]->attr.size[i]; - stride[i] = 1; - } - - for (i = self->nn_param.crop.dims; i < inputs[0]->attr.dim_num; i++) - { - start[i] = 0; - end[i] = outputs[0]->attr.size[i]; - stride[i] = 1; - } - - memset(&attr, 0, sizeof(attr)); - attr.size[0] = inputs[0]->attr.dim_num; - attr.dim_num = 1; - attr.is_const = TRUE; - attr.dtype.vx_type = VSI_NN_TYPE_INT32; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - begin_dims_tensor = vsi_nn_CreateTensorFromData( - self->graph, - (uint8_t *)start, - 
&attr); - if( NULL == begin_dims_tensor ) - { - VSILOGE("Create begin_dims_tensor fail.(crop)"); - return VSI_FAILURE; - } - - end_dims_tensor = vsi_nn_CreateTensorFromData( - self->graph, - (uint8_t *)end, - &attr); - if( NULL == end_dims_tensor ) - { - VSILOGE("Create end_dims_tensor fail.(crop)"); - status = VSI_FAILURE; - goto OnError; - } - - stride_dims_tensor = vsi_nn_CreateTensorFromData( - self->graph, - (uint8_t *)stride, - &attr); - if( NULL == stride_dims_tensor ) - { - VSILOGE("Create stride_dims_tensor fail.(crop)"); - status = VSI_FAILURE; - goto OnError; - } - - param.begin_dims = REQUIRED_IO(begin_dims_tensor); - param.end_dims = REQUIRED_IO(end_dims_tensor); - param.stride_dims = REQUIRED_IO(stride_dims_tensor); - - self->n = vxTensorStrideSliceNode( - self->graph->g, - inputs[0]->t, - ¶m, - sizeof(vx_nn_stride_slice_params_t), - outputs[0]->t - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } -OnError: - if (begin_dims_tensor) vsi_nn_ReleaseTensor(&begin_dims_tensor); - if (end_dims_tensor) vsi_nn_ReleaseTensor(&end_dims_tensor); - if (stride_dims_tensor) vsi_nn_ReleaseTensor(&stride_dims_tensor); - return status; + return vsi_nn_internal_compute_node( self ); } /* op_compute() */ static vsi_bool op_check @@ -153,6 +64,17 @@ static vsi_bool op_check return ret; } +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -160,27 +82,32 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_nn_crop_param * p; - int32_t i; + vsi_nn_crop_param * p = NULL; + int32_t i = 0; + uint32_t j = 0; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp( self ); p = (vsi_nn_crop_param *)&(self->nn_param.crop); + if (p->axis >= (int32_t)inputs[0]->attr.dim_num) { VSILOGE("Invalid parameter: axis!\n"); return FALSE; } - if( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) { - return TRUE; + goto final; } if (p->dims + p->axis == inputs[0]->attr.dim_num) { - for(i = 0; i < p->axis; i++) + for (i = 0; i < p->axis; i++) { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } - for(i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) + for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; } @@ -190,12 +117,12 @@ static vsi_bool op_setup { if (p->dims == 1) { - for(i = 0; i <= p->axis; i++) + for (i = 0; i <= p->axis; i++) { outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; p->offset[i] = p->offset[0]; } - for(i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) + for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } @@ -208,9 +135,78 @@ static vsi_bool op_setup } } +final: + for (j = 0; j < self->nn_param.crop.dims; j++) + { + p->lcl_data->begin_dims[j] = (int32_t)self->nn_param.crop.offset[j]; + p->lcl_data->end_dims[j] = (int32_t)self->nn_param.crop.offset[j] + (int32_t)outputs[0]->attr.size[j]; + p->lcl_data->stride_dims[j] = 1; + } + + for (j = self->nn_param.crop.dims; j < inputs[0]->attr.dim_num; j++) + { + p->lcl_data->begin_dims[j] = 0; + p->lcl_data->end_dims[j] = (int32_t)outputs[0]->attr.size[j]; + p->lcl_data->stride_dims[j] = 1; + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + 
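/* Editor's note -- not part of the patch: the rewritten CROP op_setup above lowers
 * the crop to an internal STRIDED_SLICE node. begin_dims/end_dims come straight from
 * the crop offsets and the output extent, with stride fixed at 1 on every axis; for
 * example, offset = {2, 3} with an output size of {10, 20} would give
 * begin_dims = {2, 3}, end_dims = {12, 23}, stride_dims = {1, 1}. The parameter
 * wiring of the internal node continues below. */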
curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; + curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; + curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.stride_dims = p->lcl_data->stride_dims; + curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.begin_mask = 0; + curr->node->nn_param.strided_slice.end_mask = 0; + curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + return TRUE; } /* op_setup() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_crop_param * p = NULL; + + p = &(self->nn_param.crop); + + p->lcl_data = (vsi_nn_crop_lcl_data *)malloc(sizeof(vsi_nn_crop_lcl_data)); + if (NULL == p->lcl_data) + { + return VSI_FAILURE; + } + memset(p->lcl_data, 0, sizeof(vsi_nn_crop_lcl_data)); + + return status; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_crop_param * p = NULL; + + p = &(self->nn_param.crop); + + vsi_nn_safe_free(p->lcl_data); + + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + + + #ifdef __cplusplus extern "C" { #endif @@ -218,12 +214,12 @@ extern "C" { DEF_OP_REG ( /* op_name */ CROP, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ 2, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c new file mode 100644 index 0000000..d976b13 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c @@ -0,0 +1,178 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + + int32_t axis = self->nn_param.cumsum.axis; + int32_t exclusive = (int32_t)self->nn_param.cumsum.exclusive; + int32_t reverse = (int32_t)self->nn_param.cumsum.reverse; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); + vsi_nn_kernel_param_add_int32( param, "exclusive", (int32_t)exclusive ); + vsi_nn_kernel_param_add_int32( param, "reverse", (int32_t)reverse ); + n = vsi_nn_kernel_selector( self->graph, "cumsum", inputs, 1, outputs, 1, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + END_IO_TYPE_DECL(CUMSUM) + if (!VALIDATE_OP_IO_TYPES(CUMSUM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.cumsum.axis = 0; + self->nn_param.cumsum.exclusive = FALSE; + self->nn_param.cumsum.reverse = FALSE; + + return status; +} /* op_init() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + uint32_t i = 0; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUMSUM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index 1929167..e18c4bd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -80,7 +80,6 @@ static vsi_bool _is_same_quant return TRUE; } /* _is_same_quant */ - static vsi_status op_optimize ( vsi_nn_node_t * self, @@ -237,34 +236,48 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_U8|Q_ASYM) IO_TYPE(D_BOOL8, D_I8|Q_ASYM) IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_SYM) IO_TYPE(D_BOOL8, D_I16|Q_DFP) - IO_TYPE(D_BOOL8, D_U8) - IO_TYPE(D_BOOL8, D_I8) - IO_TYPE(D_BOOL8, D_I8) - IO_TYPE(D_BOOL8, D_I16) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_I32) + IO_TYPE(D_BOOL8, D_U16) + IO_TYPE(D_BOOL8, D_U32) + IO_TYPE(D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_BOOL8) + IO_TYPE(D_I32, D_BOOL8) + IO_TYPE(D_U16, D_BOOL8) + IO_TYPE(D_U32, D_BOOL8) IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_F16) IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32, D_I32) IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I16) IO_TYPE(D_I32, D_I8|Q_DFP) - IO_TYPE(D_I32, D_I8) IO_TYPE(D_I32, D_U32) IO_TYPE(D_I32, D_U16) IO_TYPE(D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I32, D_U8) IO_TYPE(D_U32, D_U32) IO_TYPE(D_U32, D_I16|Q_DFP) - IO_TYPE(D_U32, D_I16) IO_TYPE(D_U32, D_I8|Q_DFP) - IO_TYPE(D_U32, D_I8) IO_TYPE(D_U32, D_I32) - IO_TYPE(D_U32, D_U16) IO_TYPE(D_U32, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U8) IO_TYPE(D_BF16, D_I32) IO_TYPE(D_I32, D_BF16) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I4|Q_SYM) /* HW 9.0.1 */ IO_TYPE(D_I8|Q_DFP, D_BF16) @@ -276,6 +289,25 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_BF16) IO_TYPE(D_I16|Q_DFP, D_F32) IO_TYPE(D_F16, D_F32) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U4|Q_SYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U4|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(DATACONVERT) if 
(!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index 483a6dc..09c59d8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -198,190 +198,7 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - BEGIN_IO_TYPE_DECL(DECONVOLUTION, 3, 1) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - - /* HW 9.0 */ - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) - - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, 
D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, 
D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) - IO_TYPE(D_F16, D_F16, D_NONE, D_F32) - IO_TYPE(D_F16, D_F16, D_F32, D_BF16) - IO_TYPE(D_F16, D_F16, D_F32, D_F32) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) - - IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_F16) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) - - END_IO_TYPE_DECL(DECONVOLUTION) - if (!VALIDATE_OP_IO_TYPES(DECONVOLUTION, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); return ret; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index eb8f75b..0692666 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -220,7 +220,7 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - ret = vsi_nn_OpCheck(VSI_NN_OP_DECONVOLUTION, self, inputs, outputs); + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV1D, self, inputs, outputs); return ret; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c index c1c4404..6b7cc6f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c @@ -105,20 +105,32 @@ 
static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL, 1, 1) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) END_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL) - if(!VALIDATE_OP_IO_TYPES(DEPTH2SPACE_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(DEPTH2SPACE_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c index f63db97..fa53367 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c @@ -70,32 +70,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(DROPOUT, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - END_IO_TYPE_DECL(DROPOUT) - if (!VALIDATE_OP_IO_TYPES(DROPOUT, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_LINEAR, self, inputs, outputs); - return TRUE; + return ret; } /* op_check() */ static vsi_bool op_setup @@ -140,4 +117,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index 496d42e..73ba406 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -350,30 +350,63 @@ static vsi_bool op_check_pow { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(POW, 2, 1) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, 
D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(POW) - if(!VALIDATE_OP_IO_TYPES(POW, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(POW, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -398,30 +431,56 @@ static vsi_bool op_check_add IO_TYPE(D_BF16, D_BF16, D_F32) IO_TYPE(D_F16, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) 
IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) @@ -491,8 +550,14 @@ static vsi_bool op_check_add IO_TYPE(D_F32, D_BF16, D_BF16) IO_TYPE(D_F32, D_BF16, F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(ADD) - if(!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -527,45 +592,65 @@ static vsi_bool op_check_div { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(DIVIDE, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, 
D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(DIVIDE) - if(!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -596,22 +681,40 @@ static vsi_bool op_check_mul IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) 
IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) @@ -677,8 +780,14 @@ static vsi_bool op_check_mul IO_TYPE(D_F32, D_BF16, D_BF16) IO_TYPE(D_F32, D_BF16, F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(MULTIPLY) - if(!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 9a85fd1..7dc29af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -178,6 +178,12 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) END_IO_TYPE_DECL(ELTWISE_UNARY) if (!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num)) { @@ -252,6 +258,9 @@ DEF_ELEMENT_WISE_UNARY_OP( ROUND, round ); DEF_ELEMENT_WISE_UNARY_OP( GELU, gelu ); DEF_ELEMENT_WISE_UNARY_OP( SELU, selu ); DEF_ELEMENT_WISE_UNARY_OP( CELU, celu ); +DEF_ELEMENT_WISE_UNARY_OP( RCP, rcp ); +DEF_ELEMENT_WISE_UNARY_OP( SIGN, sign ); +DEF_ELEMENT_WISE_UNARY_OP( SOFTSIGN, softsign ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c index a789f2c..84e36be 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c @@ -22,7 +22,6 
@@ * *****************************************************************************/ - #include #include @@ -93,6 +92,13 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(ERF) if (!VALIDATE_OP_IO_TYPES(ERF, self, inputs, self->input.num, outputs, self->output.num)) { @@ -106,7 +112,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - __BEGIN_DECLS /* Registrar */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c index df4aa95..6bb4dad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c @@ -54,25 +54,37 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(EXPAND_BROADCAST, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) - IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) END_IO_TYPE_DECL(EXPAND_BROADCAST) if (!VALIDATE_OP_IO_TYPES(EXPAND_BROADCAST, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c index 1e9d5a7..9285e6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c @@ -43,18 +43,32 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(FLOOR, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + 
IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) END_IO_TYPE_DECL(FLOOR) if (!VALIDATE_OP_IO_TYPES(FLOOR, self, inputs, self->input.num, outputs, self->output.num)) { @@ -80,7 +94,7 @@ static vsi_status op_compute memset(&p, 0, sizeof(p)); p.mode = VX_NN_DS_SIZE_ROUNDING_FLOOR; self->n = vxTensorRoundingNode(self->graph->g, inputs[0]->t, &p, sizeof(p), outputs[0]->t); - if( !self->n ) + if ( !self->n ) { status = VSI_FAILURE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index 812c7df..8026198 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -229,6 +229,60 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_F32) + + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, 
D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) @@ -291,6 +345,33 @@ static vsi_bool op_check IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + END_IO_TYPE_DECL(FCL) ret = VALIDATE_OP_IO_TYPES(FCL, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index 6c1bdc2..b91fec8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -185,7 +185,8 @@ static vsi_bool op_check /* TP Support */ if (!ret ) { uint32_t valid_dtypes[] = { - D_F16, D_BF16, D_F32, D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_ASYM, D_U8|Q_DFP, D_U8|Q_ASYM + D_F16, D_BF16, D_F32, D_I16|Q_DFP, D_I16|Q_SYM, D_I16|Q_ASYM, D_I8|Q_DFP, D_I8|Q_SYM, + D_I8|Q_ASYM, D_U8|Q_DFP, D_U8|Q_ASYM }; uint32_t weight_type = inputs[1]->attr.dtype.vx_type | inputs[1]->attr.dtype.qnt_type << Q_SHIFT; @@ -332,7 +333,6 @@ static vsi_bool op_setup } } - if( NULL == inputs[1]->wb ) { VSILOGE( "Create weight bias 
fail." ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 21e0a17..00545d3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -34,7 +34,6 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -202,26 +201,40 @@ static vsi_bool _op_check ) { BEGIN_IO_TYPE_DECL(GROUP_NORM, 3, 1) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_F32, D_F16, D_I32) - IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) END_IO_TYPE_DECL(GROUP_NORM) if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) { @@ -263,21 +276,11 @@ static vsi_status _op_deinit ) { vsi_nn_groupnormalize_param *p = &(self->nn_param.groupnorm); - if (p->lcl_data->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_input)); - p->lcl_data->reshaped_input = NULL; - } - if (p->lcl_data->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_output)); - p->lcl_data->reshaped_output = NULL; - } - if (self->nn_param.groupnorm.lcl_data) - { - free(self->nn_param.groupnorm.lcl_data); - self->nn_param.groupnorm.lcl_data = NULL; - } + + vsi_safe_release_tensor(p->lcl_data->reshaped_input); + vsi_safe_release_tensor(p->lcl_data->reshaped_output); + vsi_nn_safe_free(self->nn_param.groupnorm.lcl_data) + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index cdead0c..be1f3f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -34,7 +34,6 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" @@ -175,24 +174,42 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(INSTANCE_NORM, 3, 1) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_F32, D_F16, D_I32) - IO_TYPE(D_I32, D_F32, D_F16, D_F32) - IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) END_IO_TYPE_DECL(INSTANCE_NORM) if (!VALIDATE_OP_IO_TYPES(INSTANCE_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index fc378ad..74623e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -34,12 +34,12 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) +#define VSI_NN_SUPPORT_AXIS (0) static vsi_status op_compute ( @@ -52,13 +52,12 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.layernorm.eps; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 && - inputs[2]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 ) +#if VSI_NN_SUPPORT_AXIS + if ( 0 ) { return vsi_nn_internal_compute_node( self ); } +#endif param = vsi_nn_kernel_param_create(); @@ -87,18 +86,18 
@@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; +#if VSI_NN_SUPPORT_AXIS vsi_nn_internal_node_t* curr = NULL; +#endif if ( NULL == self ) { return FALSE; } - +#if VSI_NN_SUPPORT_AXIS vsi_nn_internal_init_node_wksp( self ); - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 && - inputs[2]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 ) + if ( 0 ) { vsi_nn_internal_tensor_t* mean_tensor = NULL; vsi_nn_internal_tensor_t* vari_tensor = NULL; @@ -137,6 +136,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); } else +#endif { ret = vsi_nn_op_common_setup(self, inputs, outputs); } @@ -152,18 +152,52 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) END_IO_TYPE_DECL(LAYER_NORM) if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) { @@ -182,18 +216,9 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - uint32_t i = 0; - for (i = 0; i < _VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.layernorm.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.layernorm.local.local_tensor[i])); - self->nn_param.layernorm.local.local_tensor[i] = NULL; - } - } - 
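All of the LAYER_NORM input/output combinations listed above feed the same computation: the eps value read in op_compute enters the usual normalization y = (x - mean) / sqrt(var + eps) * scale + bias. A minimal float reference, assuming row-major data and per-element scale/bias arrays (the gamma/beta naming and buffer layout are illustrative, not the kernel's actual interface):

    /* Float reference of layer normalization over the innermost axis; the
     * gamma/beta names and layout are assumptions for illustration only. */
    #include <math.h>
    #include <stddef.h>

    static void layer_norm_ref(const float *x, const float *gamma, const float *beta,
                               float *y, size_t rows, size_t cols, float eps)
    {
        size_t r, c;
        for (r = 0; r < rows; r++)
        {
            const float *in = x + r * cols;
            float *out = y + r * cols;
            float mean = 0.0f, var = 0.0f;

            for (c = 0; c < cols; c++) mean += in[c];
            mean /= (float)cols;
            for (c = 0; c < cols; c++) var += (in[c] - mean) * (in[c] - mean);
            var /= (float)cols;

            for (c = 0; c < cols; c++)
            {
                out[c] = (in[c] - mean) / sqrtf(var + eps) * gamma[c] + beta[c];
            }
        }
    }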
+#if VSI_NN_SUPPORT_AXIS vsi_nn_internal_deinit_node_wksp( self ); - +#endif vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c index 5ac26a6..be32c4b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c @@ -66,33 +66,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(LEAKY_RELU, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - END_IO_TYPE_DECL(LEAKY_RELU) - if (!VALIDATE_OP_IO_TYPES(LEAKY_RELU, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); - return TRUE; + return ret; } /* op_check() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c index 3e79acc..097075d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c @@ -74,33 +74,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(LINEAR, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - END_IO_TYPE_DECL(LINEAR) - if (!VALIDATE_OP_IO_TYPES(LINEAR, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); - return TRUE; + return ret; } /* op_check() */ __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c index 00030fe..fd12173 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -140,22 +140,34 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(LOG_SOFTMAX, 1, 1) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_BF16, D_F16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, 
D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) END_IO_TYPE_DECL(LOG_SOFTMAX) - if(!VALIDATE_OP_IO_TYPES(LOG_SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(LOG_SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -206,7 +218,6 @@ DEF_OP_REG \ DEF_LOG_SOFTMAX_OP( LOG_SOFTMAX, log_softmax ); - #undef DEF_LOG_SOFTMAX_OP #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c new file mode 100644 index 0000000..57f8cad --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_link_list.h" +#include "vsi_nn_internal_node.h" + +typedef struct _max_pool3d_local_data_t { + int32_t placeholder; +} max_pool3d_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_POOL, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_max_pool3d_param *p = &(self->nn_param.max_pool3d); + vsi_size_t ksize[_cnt_of_array(p->ksize)] = {0}, i = 0; + vsi_size_t pad[_cnt_of_array(p->pad)] = {0}; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t* pool2d_0_tensor = NULL; + vsi_nn_internal_tensor_t* reshape_0_tensor = NULL; + vsi_nn_internal_tensor_t* pool2d_1_tensor = NULL; + vsi_nn_tensor_attr_t attr; + vsi_size_t* reshape_input_size = NULL; + vsi_size_t* reshape_pool_size = NULL; + + for (i = 0; i < _cnt_of_array(p->ksize); i++) + { + ksize[i] = p->ksize[i]; + } + for (i = 0; i < _cnt_of_array(p->pad); i++) + { + pad[i] = p->pad[i]; + } + + vsi_nn_compute_padding_3d( + inputs[0]->attr.size, + ksize, + p->stride, + NULL, + p->pad_type, + pad + ); + + for (i = 0; i < _cnt_of_array(p->ksize); i++) + { + p->ksize[i] = (uint32_t)ksize[i]; + } + + for (i = 0; i < _cnt_of_array(p->pad); i++) + { + p->pad[i] = (uint32_t)pad[i]; + } + + /* Pooling */ + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + p->ksize[0], + &p->pad[0], + p->stride[0], + 0, + p->round_type + ); + + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + p->ksize[1], + &p->pad[2], + p->stride[1], + 0, + p->round_type + ); + + outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[2], + p->ksize[2], + &p->pad[4], + p->stride[2], + 0, + p->round_type + ); + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + vsi_nn_internal_init_node_wksp( self ); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); + input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + pool2d_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + reshape_input_size = vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + reshape_input_size[0] = inputs[0]->attr.size[0]; + reshape_input_size[1] = inputs[0]->attr.size[1]; + reshape_input_size[2] = 1; + for (i = 2; i < inputs[0]->attr.dim_num; i++) + { + reshape_input_size[2] *= inputs[0]->attr.size[i]; + } + reshape_input_size[3] = 1; + curr->node->nn_param.reshape2.size = reshape_input_size; + curr->node->nn_param.reshape2.dim_num = 4; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 0, 0 ); + 
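The vsi_nn_ComputeFilterSize calls earlier in this op_setup follow the usual pooling output-size arithmetic. A minimal sketch, with a single ceil/floor flag standing in for ovxlib's round_type enum (an assumption for illustration):

    /* Pooling output extent: floor((in + pads - ksize) / stride) + 1, plus one
     * extra window when ceil rounding is requested and the division is inexact.
     * Assumes in + pad_front + pad_back >= ksize. */
    #include <stddef.h>

    static size_t pooled_extent(size_t in, size_t ksize, size_t pad_front,
                                size_t pad_back, size_t stride, int ceil_mode)
    {
        size_t span = in + pad_front + pad_back - ksize;
        size_t out = span / stride + 1;

        if (ceil_mode && (span % stride) != 0)
        {
            out += 1;   /* partially covered trailing window */
        }
        return out;
    }

The 3D max pooling itself is assembled from two 2D passes, as the rest of this op_setup shows: the first VSI_NN_OP_POOL node pools over width/height with depth folded into the batch dimension, and, unless the depth kernel is trivial, a second pass pools over depth after another reshape.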
curr->node->nn_param.pool.ksize[0] = p->ksize[0]; + curr->node->nn_param.pool.ksize[1] = p->ksize[1]; + curr->node->nn_param.pool.stride[0] = p->stride[0]; + curr->node->nn_param.pool.stride[1] = p->stride[1]; + curr->node->nn_param.pool.pad[0] = p->pad[0]; + curr->node->nn_param.pool.pad[1] = p->pad[1]; + curr->node->nn_param.pool.pad[2] = p->pad[2]; + curr->node->nn_param.pool.pad[3] = p->pad[3]; + curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; + curr->node->nn_param.pool.round_type = p->round_type; + curr->node->nn_param.pool.pad_type = p->pad_type; + curr->inputs[0] = input_tensor->t; + curr->outputs[0] = pool2d_0_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + if (p->ksize[2] == 1 && p->stride[2] == 1 && p->pad[4] == 0 && p->pad[5] == 0) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = pool2d_0_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + else + { + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); + reshape_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + pool2d_1_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + reshape_pool_size = vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + reshape_pool_size[0] = -1; + reshape_pool_size[1] = inputs[0]->attr.size[2]; + reshape_pool_size[2] = 1; + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + reshape_pool_size[2] *= inputs[0]->attr.size[i]; + } + reshape_pool_size[3] = 1; + curr->node->nn_param.reshape2.size = reshape_pool_size; + curr->node->nn_param.reshape2.dim_num = 4; + curr->inputs[0] = pool2d_0_tensor->t; + curr->outputs[0] = reshape_0_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 1, 1 ); + curr->node->nn_param.pool.ksize[0] = 1; + curr->node->nn_param.pool.ksize[1] = p->ksize[2]; + curr->node->nn_param.pool.stride[0] = 1; + curr->node->nn_param.pool.stride[1] = p->stride[2]; + curr->node->nn_param.pool.pad[0] = 0; + curr->node->nn_param.pool.pad[1] = 0; + curr->node->nn_param.pool.pad[2] = p->pad[4]; + curr->node->nn_param.pool.pad[3] = p->pad[5]; + curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; + curr->node->nn_param.pool.round_type = p->round_type; + curr->node->nn_param.pool.pad_type = p->pad_type; + curr->inputs[0] = reshape_0_tensor->t; + curr->outputs[0] = pool2d_1_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = pool2d_1_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + + return ret; +} /* op_setup() */ + + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MAX_POOL3D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ 
op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c new file mode 100644 index 0000000..3432790 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c @@ -0,0 +1,223 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (2) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + vsi_nn_kernel_param_t * param = NULL; + int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0]; + int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1]; + int32_t stride_x = (int32_t)self->nn_param.pool.stride[0]; + int32_t stride_y = (int32_t)self->nn_param.pool.stride[1]; + int32_t pad_left = (int32_t)self->nn_param.pool.pad[0]; + int32_t pad_right = (int32_t)self->nn_param.pool.pad[1]; + int32_t pad_top = (int32_t)self->nn_param.pool.pad[2]; + int32_t pad_bottom = (int32_t)self->nn_param.pool.pad[3]; + + if ( NULL == self ) + { + return VSI_FAILURE; + } + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_optimize_nchw2xhw_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &new_rank); + vsi_nn_kernel_optimize_nchw2xhw_shape(outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[1], &new_rank); + if (new_rank == 3 && shapes[1][2] == 1) + { + new_rank = 2; + } + + vsi_nn_kernel_param_add_int32( param, "ksize_x", ksize_x ); + vsi_nn_kernel_param_add_int32( param, "ksize_y", ksize_y ); + vsi_nn_kernel_param_add_int32( param, "stride_x", stride_x ); + vsi_nn_kernel_param_add_int32( param, "stride_y", stride_y ); + vsi_nn_kernel_param_add_int32( param, "pad_left", pad_left ); + vsi_nn_kernel_param_add_int32( param, "pad_right", pad_right ); + vsi_nn_kernel_param_add_int32( param, "pad_top", pad_top ); + vsi_nn_kernel_param_add_int32( param, "pad_bottom", pad_bottom ); + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[1], shapes[1], new_rank ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "maxpoolwithargmax", + &reshape_tensors[0], _INPUT_NUM, &reshape_tensors[1], _OUTPUT_NUM, param ); + + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(MAXPOOLWITHARGMAX, 1, 2) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F32, D_F32, D_I32) + IO_TYPE(D_F16, D_F16, D_I32) + IO_TYPE(D_BF16, D_BF16, D_I32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32) + END_IO_TYPE_DECL(MAXPOOLWITHARGMAX) + if (!VALIDATE_OP_IO_TYPES(MAXPOOLWITHARGMAX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + vsi_bool ret = TRUE; + vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)] = {0}; + vsi_size_t i = 0; + vsi_size_t pad[_cnt_of_array(self->nn_param.pool.pad)] = {0}; + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + ksize[i] = self->nn_param.pool.ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + pad[i] = self->nn_param.pool.pad[i]; + } + + vsi_nn_compute_padding( + inputs[0]->attr.size, + ksize, + self->nn_param.pool.stride, + NULL, + self->nn_param.pool.pad_type, + pad + ); + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + self->nn_param.pool.pad[i] = (uint32_t)pad[i]; + } + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); + + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MAXPOOLWITHARGMAX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c new file mode 100644 index 0000000..29310ad --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c @@ -0,0 +1,237 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _mod_local_data_t { + int32_t placeholder; +} mod_local_data_t; + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t new_rank = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + int32_t isfmod = (int32_t)self->nn_param.mod.fmod; + + if (NULL == self) + { + return VSI_FAILURE; + } + + param = vsi_nn_kernel_param_create(); + + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + vsi_nn_kernel_param_add_int32( param, "isfmod", isfmod ); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[2], new_rank ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "mod", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[2], _OUTPUT_NUM, param ); + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + + if (self->n) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(MOD, 2, 1) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, 
D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) + IO_TYPE(D_I32, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I32, D_I32, D_I16|Q_SYM) + END_IO_TYPE_DECL(MOD) + if (!VALIDATE_OP_IO_TYPES(MOD, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, out_rank, in1_rank, in2_rank; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + in1_rank = inputs[0]->attr.dim_num; + in2_rank = inputs[1]->attr.dim_num; + out_rank = vsi_nn_max( in1_rank, in2_rank ); + + for(i = 0; i < out_rank; i++) + { + vsi_size_t sz0, sz1; + sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; + sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; + shape[i] = vsi_nn_max( sz0, sz1 ); + } + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); + } + else + { + vsi_size_t total_size_got; + vsi_size_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if (total_size_expected != total_size_got) + { + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MOD, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c index 28602c7..eb15ccc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c @@ -53,20 +53,63 @@ static const char *_get_vx_nbg_type } static void _set_io_index - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) +( + vsi_nn_node_t* self, + vsi_nn_tensor_t** inputs, + vsi_nn_tensor_t** outputs +) { - uint32_t idx,i; + uint32_t idx, i, j; idx = 0; - for(i = 0; i < self->input.num; i++) + for (i = 0; i < self->input.num; i++) { + uint32_t scalar_index=0; + vx_parameter param = 0; + vx_enum type = 0; + vxSetParameterByIndex(self->n, idx++, (vx_reference)inputs[i]->t); + scalar_index = idx; + param = vxGetParameterByIndex(self->n, scalar_index); + vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if 
(param != NULL) + { + vxReleaseParameter(¶m); + param = NULL; + + } + if (type != VX_TYPE_SCALAR) + { + continue; + } + else + { + + /* 4 crop scalar parameters input */ + for (j = scalar_index; j < scalar_index + 4; j++) + { + vx_enum data_type = 0; + vx_reference ref = 0; + vsi_status status; + param = vxGetParameterByIndex(self->n, j); + vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); + if (status == VX_ERROR_INVALID_REFERENCE) + { + vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); + ref = (vx_reference)scalar; + vxSetParameterByIndex(self->n, idx++, ref); + vxReleaseReference(&ref); + } + if (param != NULL) + { + vxReleaseParameter(¶m); + param = NULL; + } + } + } } - for(i = 0; i < self->output.num; i++) + for (i = 0; i < self->output.num; i++) { vxSetParameterByIndex(self->n, idx++, (vx_reference)outputs[i]->t); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index e6e5d72..c1d35eb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -48,13 +48,14 @@ vsi_status vsi_nn_InitPadParameter uint8_t i; vsi_status status = VSI_FAILURE; - if(NULL == node || NULL == param) + memset(param, 0, sizeof(vx_nn_pad_params_t)); + + if (NULL == node) { VSILOGE("Set param fail\n"); return VSI_FAILURE; } - memset(param, 0, sizeof(vx_nn_pad_params_t)); pad_const_val = node->nn_param.pad.const_val; param->pad_mode = node->nn_param.pad.mode; param->pad_const = vxCreateScalar( node->graph->ctx->c, VX_TYPE_INT32, &pad_const_val ); @@ -139,10 +140,10 @@ static vsi_status op_compute vsi_nn_tensor_t *convert_tensor = NULL; status = VSI_FAILURE; - if(VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) + if (VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) { VSILOGE("Set Pad Layer Parameter fail\n"); - return VSI_FAILURE; + goto final; } if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) @@ -174,14 +175,15 @@ static vsi_status op_compute sizeof(p) ); - vsi_nn_DeinitPadParameter(&p); - vsi_safe_release_tensor(convert_tensor); - - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } +final: + vsi_nn_DeinitPadParameter(&p); + vsi_safe_release_tensor(convert_tensor); + return status; } /* op_compute() */ @@ -193,14 +195,26 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(PAD, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I32) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(PAD) if (!VALIDATE_OP_IO_TYPES(PAD, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c index d0b89aa..bd01a72 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -71,6 +70,48 @@ static int32_t _get_vx_pad_mode(vx_enum mode) return pad_mode; } +static int32_t _check_mirror_pad_size + ( + vx_enum mode, + const uint32_t * front_size, + const uint32_t * back_size, + uint32_t pad_dim, + vsi_size_t *input_size, + uint32_t tensor_dim + ) +{ + uint32_t dim = pad_dim > tensor_dim ? tensor_dim : pad_dim; + uint32_t i = 0; + + for (i = 0; i < dim; i++) + { + uint32_t front = front_size[i]; + uint32_t end = back_size[i]; + uint32_t sz = (uint32_t)input_size[i]; + + if (mode == VSI_NN_PAD_MODE_SYMMETRIC) + { + if (front > sz || end > sz) + { + VSILOGE("MIRROR SYMMETRIC PAD:each padding value must be less than \ + or equal to the corresponding dimension"); + return FALSE; + } + } + else if (mode == VSI_NN_PAD_MODE_REFLECT) + { + if (front >= sz || end >= sz) + { + VSILOGE("MIRROR REFLECT PAD:each padding value must be less than \ + the corresponding dimension"); + return FALSE; + } + } + } + + return TRUE; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -110,6 +151,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = FALSE; + vsi_nn_pad2_param *p = &self->nn_param.pad2; + BEGIN_IO_TYPE_DECL(PAD2, 1, 1) IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_BF16) @@ -118,7 +162,19 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I32) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(PAD2) if (!VALIDATE_OP_IO_TYPES(PAD2, self, inputs, self->input.num, outputs, self->output.num)) { @@ -136,7 +192,10 @@ static vsi_bool op_check return FALSE; } - return TRUE; + ret = _check_mirror_pad_size(p->mode, p->front_size, p->back_size, p->dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index f1386c7..399d0c6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -105,7 +105,7 @@ static vsi_bool _is_same_quant dtype = &inputs[0]->attr.dtype; _dtype = &outputs[0]->attr.dtype; - if(vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) + if (vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) { return FALSE; } @@ -136,7 +136,7 @@ static vsi_status op_compute self->nn_param.permute.dim_num ); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -153,23 +153,27 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(PERMUTE, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_I16, D_I16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) - IO_TYPE(D_BOOL8, D_BOOL8) - IO_TYPE(D_BOOL8, D_I8|Q_DFP) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_I32, D_I32) + 
IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32) END_IO_TYPE_DECL(PERMUTE) if (!VALIDATE_OP_IO_TYPES(PERMUTE, self, inputs, self->input.num, outputs, self->output.num)) { @@ -202,13 +206,13 @@ static vsi_bool op_setup } ret = TRUE; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - for( i = 0; i < self->nn_param.permute.dim_num; i ++ ) + for ( i = 0; i < self->nn_param.permute.dim_num; i ++ ) { axis = self->nn_param.permute.perm[i]; - if( axis >= inputs[0]->attr.dim_num ) + if ( axis >= inputs[0]->attr.dim_num ) { VSILOGE( "Error permute axis '%u', the dim is '%u' ", axis, inputs[0]->attr.dim_num ); @@ -231,8 +235,6 @@ static vsi_status op_optimize ) { vsi_status status; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; - uint32_t i = 0; status = VSI_SUCCESS; @@ -245,18 +247,13 @@ static vsi_status op_optimize VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - for (i = 0; i < self->nn_param.permute.dim_num; i++) + if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) { - shape[i] = inputs[0]->attr.size[self->nn_param.permute.perm[i]]; - } - - if( direction == VSI_NN_OPTIMIZE_BACKWARD ) - { - if(NULL == inputs[0]->t && NULL != outputs[0]->t) + if (NULL == inputs[0]->t && NULL != outputs[0]->t) { inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0]) ); - if( inputs[0]->t == NULL ) + if ( inputs[0]->t == NULL ) { status = VSI_FAILURE; } @@ -265,12 +262,17 @@ static vsi_status op_optimize } else { - if(NULL == outputs[0]->t) + if (NULL == outputs[0]->t) { - vsi_bool ret; - ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], - shape, (vsi_size_t)self->nn_param.permute.dim_num ); - if( ret == FALSE ) + if ( NULL == inputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, inputs[0] ); + } + + outputs[0]->t = vsi_nn_safe_reshape_tensor( inputs[0]->t, + (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num, + sizeof(outputs[0]->attr.size[0]) ); + if ( outputs[0]->t == NULL ) { status = VSI_FAILURE; } @@ -278,8 +280,6 @@ static vsi_status op_optimize } } - //vsi_nn_ReshapeTensor(self->graph, inputs[0], outputs[0], shape, self->nn_param.permute.dim_num); - return status; } /* op_optimize() */ @@ -302,4 +302,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index 67e2113..eadb94a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -78,7 +78,7 @@ static vsi_status op_compute status = VSI_FAILURE; memset( &params, 0, sizeof( params ) ); - if(_is_pool1d(self, inputs)) + if (_is_pool1d(self, inputs)) { // pool1d tmp_inputs[0] = local->reshaped_input; @@ -120,7 +120,7 @@ static vsi_status op_compute tmp_outputs[0]->t ); - if( NULL != self->n ) + if ( 
NULL != self->n ) { status = VSI_SUCCESS; } @@ -170,11 +170,11 @@ static vsi_status op_optimize shape[3] = outputs[0]->attr.size[2]; dim = 4; local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); - if(local->reshaped_output && local->reshaped_output->t) + if (local->reshaped_output && local->reshaped_output->t) { memset(tensor_name, 0, sizeof(tensor_name)); snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); - if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + if (vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) { VSILOGW("Set uid %u pool1d reshaped output name fail", self->uid); return VSI_FAILURE; @@ -185,7 +185,6 @@ static vsi_status op_optimize return VSI_SUCCESS; } /* op_optimize() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -196,32 +195,51 @@ static vsi_bool op_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(POOL, 1, 1) /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP) - - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) + /* HW 9.0 */ + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_BF16) END_IO_TYPE_DECL(POOL) - if(!VALIDATE_OP_IO_TYPES(POOL, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(POOL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -260,21 +278,11 @@ static vsi_status op_deinit ) { vsi_nn_pool_param *p = &(self->nn_param.pool); - if(p->local->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); - p->local->reshaped_input = NULL; - } - if(p->local->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); - p->local->reshaped_output = 
NULL; - } - if(self->nn_param.pool.local) - { - free(self->nn_param.pool.local); - self->nn_param.pool.local = NULL; - } + + vsi_safe_release_tensor(p->local->reshaped_input); + vsi_safe_release_tensor(p->local->reshaped_output); + vsi_nn_safe_free(self->nn_param.pool.local); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -288,20 +296,20 @@ static vsi_bool op_setup ) { vsi_bool ret; - vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)], i; + vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)] = {0}, i = 0; vsi_size_t pad[_cnt_of_array(self->nn_param.pool.pad)] = {0}; ret = TRUE; - for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) { ksize[i] = self->nn_param.pool.ksize[i]; } - for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) { pad[i] = self->nn_param.pool.pad[i]; } - if(_is_pool1d(self, inputs)) + if (_is_pool1d(self, inputs)) { vsi_nn_compute_padding_conv1d( inputs[0]->attr.size, @@ -311,11 +319,11 @@ static vsi_bool op_setup self->nn_param.pool.pad_type, pad ); - for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) { self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; } - for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) { self->nn_param.pool.pad[i] = (uint32_t)pad[i]; } @@ -344,11 +352,11 @@ static vsi_bool op_setup self->nn_param.pool.pad_type, pad ); - for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) { self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; } - for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) { self->nn_param.pool.pad[i] = (uint32_t)pad[i]; } @@ -374,17 +382,13 @@ static vsi_bool op_setup self->nn_param.pool.round_type ); - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + for (i = 2; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } } outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - if( NULL != outputs[1] ) - { - outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; - memcpy( outputs[1]->attr.size, outputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); - } return ret; } /* op_setup() */ @@ -408,4 +412,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c index 88edb90..cfdf7c2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -51,7 +51,7 @@ static vsi_bool vsi_nn_poolwithargmax_optimize_shape ) { vsi_bool enable_image_2d = FALSE; - int32_t hwLitimLen = 65536; + int32_t hwLitimLen = GPU_TENSOR_MAX_WIDTH; if ((2 == self->nn_param.pool.ksize[1]) && (2 == self->nn_param.pool.stride[1]) @@ -166,7 +166,6 @@ static vsi_status op_compute if( ret ) { - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, @@ -189,7 +188,6 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); return status; - } /* op_compute() */ static vsi_bool op_check @@ -233,7 +231,6 @@ static vsi_bool op_check } return TRUE; - } 
/* op_check() */ static vsi_bool op_setup @@ -276,6 +273,10 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); + + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } return ret; @@ -310,4 +311,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index eb74aff..f913afd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -295,9 +295,7 @@ static vsi_bool op_setup { uint32_t i = 0; uint32_t axis = 2; - uint32_t group = 3; vsi_bool is_input_sep = p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ? FALSE : TRUE; - vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0]; vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; vsi_nn_tensor_attr_t attr; @@ -305,17 +303,6 @@ static vsi_bool op_setup vsi_size_t size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - - if (!is_input_sep) - { - ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis, - input_tensor_group, group); - if (ret == FALSE) - { - goto final; - } - } - memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); for(i = 0; i < p->output_attr.dim_num; i++) { @@ -361,9 +348,9 @@ static vsi_bool op_setup } else { - curr->inputs[0] = input_tensor_group[0]; - curr->inputs[1] = input_tensor_group[1]; - curr->inputs[2] = input_tensor_group[2]; + curr->inputs[0] = inputs[0]; + curr->inputs[1] = NULL; + curr->inputs[2] = NULL; } curr->outputs[0] = output_tensor_group[0]->t; curr->outputs[1] = output_tensor_group[1]->t; @@ -512,8 +499,6 @@ static vsi_bool op_setup } } -final: - return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index b7f4f1d..6d19e4a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -34,7 +34,6 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c index e0123fa..13a636d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c @@ -72,6 +72,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb888_planar.b_mean ); vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_rgb888_planar.scale ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb888_planar.local->enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 3, param ); if ( n != NULL ) { @@ -94,18 +95,41 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3) - IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - 
IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16) - END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) - if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + if (inputs[1] == NULL) + { + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 3) + IO_TYPE(D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16, D_F16, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) + + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, 1, + outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + else + { + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) + + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num, + outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; @@ -192,11 +216,7 @@ static vsi_status op_deinit { vsi_status status = VSI_SUCCESS; - if (self->nn_param.pre_process_rgb888_planar.local != NULL) - { - free(self->nn_param.pre_process_rgb888_planar.local); - self->nn_param.pre_process_rgb888_planar.local = NULL; - } + vsi_nn_safe_free(self->nn_param.pre_process_rgb888_planar.local); vsi_nn_op_common_deinit(self); return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 3642b47..5a37151 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -80,7 +80,7 @@ static vsi_bool caculate_reshape_size(uint32_t* dim_value, vsi_size_t* re_sizes, vsi_size_t* re_sizes2, vx_int32 *resolved_dim, vx_int32 resolved_dim_count) { -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH vsi_bool enable_reshape = TRUE; vsi_size_t size_count = 1; uint32_t i = 0; @@ -225,6 +225,21 @@ static vsi_status op_compute vsi_nn_tensor_t *mean_tmp_tensor = NULL; vsi_nn_tensor_t *reshaped_input1 = self->nn_param.reduce.local2->reshaped_input1; vsi_nn_tensor_t *reshaped_output1 = self->nn_param.reduce.local2->reshaped_output1; + char tensor_name[128]; + + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, + sizeof(tensor_name), + "uid_%u_reshape_out_0", + self->uid); + if (reshaped_output1 && vxSetReferenceName( + (vx_reference)reshaped_output1->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reduce reshaped output name fail", + self->uid); + return VSI_FAILURE; + } + resolved_dim_count = self->nn_param.reduce.local2->axes_num; diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index ebfa574..fffe060 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -69,7 +69,7 @@ static vsi_status _comparisons_op_compute inputs[1]->attr.size, inputs[1]->attr.dim_num, outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); - if( ret ) + if ( ret ) { // Add params reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, @@ -117,7 +117,7 @@ static vsi_status _comparisons_op_compute vsi_nn_kernel_param_release( &param ); } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } @@ -133,37 +133,61 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(RELATIONAL_OPS, 2, 1) - IO_TYPE(D_F16, D_F16, D_BOOL8) - IO_TYPE(D_F16, D_I16|Q_DFP, D_BOOL8) - IO_TYPE(D_F16, D_I8|Q_DFP, D_BOOL8) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_BOOL8) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BOOL8) - IO_TYPE(D_I16|Q_DFP, D_F16, D_BOOL8) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BOOL8) - IO_TYPE(D_I8|Q_DFP, D_F16, D_BOOL8) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BOOL8) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_BOOL8) - IO_TYPE(D_BF16, D_BF16, D_BOOL8) - IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) - IO_TYPE(D_F32, D_F32, D_BOOL8) - IO_TYPE(D_I32, D_I32, D_BOOL8) + IO_TYPE(D_F16, D_F16, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_SYM, D_BOOL8) + IO_TYPE(D_F16, D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_F16, D_I8|Q_SYM, D_BOOL8) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_F16, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_F16, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_BOOL8) + IO_TYPE(D_BF16, D_BF16, D_BOOL8) + IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) + IO_TYPE(D_F32, D_F32, D_BOOL8) + IO_TYPE(D_I32, D_I32, D_BOOL8) - IO_TYPE(D_F16, D_F16, D_I8) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I8) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_I8) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I8) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_I8) - IO_TYPE(D_BF16, D_BF16, D_I8) - IO_TYPE(D_BOOL8, D_BOOL8, D_I8) - IO_TYPE(D_F32, D_F32, D_I8) - IO_TYPE(D_I32, D_I32, D_I8) + IO_TYPE(D_F16, D_F16, D_I8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I8) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I8) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I8) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I8) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I8) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8) + 
IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_BF16, D_BF16, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8, D_I8) + IO_TYPE(D_F32, D_F32, D_I8) + IO_TYPE(D_I32, D_I32, D_I8) END_IO_TYPE_DECL(RELATIONAL_OPS) - if(!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -174,7 +198,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -190,14 +213,14 @@ static vsi_bool op_setup in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); - for(i = 0; i < out_rank; i++) + for (i = 0; i < out_rank; i++) { vsi_size_t sz0, sz1; sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; shape[i] = vsi_nn_max( sz0, sz1 ); } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = (uint32_t)out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); @@ -209,7 +232,7 @@ static vsi_bool op_setup total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); - if( total_size_expected != total_size_got ) + if ( total_size_expected != total_size_got ) { VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); @@ -238,7 +261,6 @@ DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_che DEF_COMPARISONS_OP( RELATIONAL_OPS, relational_ops ); - #undef DEF_COMPARISONS_OP #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c index a10cbe6..295b6ee 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -178,15 +178,19 @@ static vsi_bool op_check vsi_nn_repeat_param * p = NULL; BEGIN_IO_TYPE_DECL(REPEAT, 2, 1) - IO_TYPE(D_F16, D_I32, D_F16) - IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I8, D_I32, D_I8) - IO_TYPE(D_U8, D_I32, D_U8) - IO_TYPE(D_I16, D_I32, D_I16) - IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I8, D_I32, D_I8) + IO_TYPE(D_U8, D_I32, D_U8) + IO_TYPE(D_I16, D_I32, D_I16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) END_IO_TYPE_DECL(REPEAT) if (!VALIDATE_OP_IO_TYPES(REPEAT, self, inputs, self->input.num, outputs, self->output.num)) { @@ -337,4 +341,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index b16ba26..6ea0fc0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -54,8 +54,29 @@ static vsi_status op_compute self->nn_param.reshape.local.initialized == FALSE) { vsi_status status = VSI_SUCCESS; - vsi_nn_tensor_t *tmp_tensor = NULL; +#ifdef VX_REMOVE_RESHAPE_SUPPORT + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t *dims_tensor = NULL; + vx_nn_reshape_params_t reshape_param; + memset(&attr, 0, sizeof(attr)); + attr.size[0] = self->nn_param.reshape.dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.reshape.size, + &attr); + + reshape_param.dims = REQUIRED_IO(dims_tensor); + + self->n = vxTensorReshapeNode(self->graph->g, + inputs[0]->t, &reshape_param, sizeof(reshape_param), outputs[0]->t); + vsi_safe_release_tensor(dims_tensor); +#else + vsi_nn_tensor_t *tmp_tensor = NULL; tmp_tensor = vsi_nn_reshape_tensor( self->graph, outputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); @@ -69,6 +90,7 @@ static vsi_status op_compute VSILOGD("Create a copy node for reshape"); vsi_safe_release_tensor(tmp_tensor); +#endif return status; } @@ -122,7 +144,9 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; - +#ifdef VX_REMOVE_RESHAPE_SUPPORT + self->nn_param.reshape.local.initialized = FALSE; +#else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return status; @@ -162,7 +186,7 @@ static vsi_status op_optimize self->nn_param.reshape.local.initialized = TRUE; } } - +#endif return status; } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 6a84273..9deb02e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -53,15 +52,46 @@ static vsi_status op_compute if (inputs[0]->t != NULL && outputs[0]->t != NULL && self->nn_param.reshape2.local->initialized == FALSE) { +#ifdef VX_REMOVE_RESHAPE_SUPPORT + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t *dims_tensor = NULL; + vx_nn_reshape_params_t reshape_param; + int32_t dims_data[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t i = 0; + + for (i = 0; i < self->nn_param.reshape2.dim_num; i++) + { + dims_data[i] = (int32_t)self->nn_param.reshape2.size[i]; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = self->nn_param.reshape2.dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)dims_data, + &attr); + + reshape_param.dims = REQUIRED_IO(dims_tensor); + + self->n = vxTensorReshapeNode(self->graph->g, + inputs[0]->t, &reshape_param, sizeof(reshape_param), outputs[0]->t); + vsi_safe_release_tensor(dims_tensor); +#else self->n = vxTensorCopyNode(self->graph->g, inputs[0]->t, outputs[0]->t); - if(NULL == self->n) +#endif + if (NULL == self->n) { VSILOGE( "Create vxTensorCopyNode fail." 
); return VSI_FAILURE; } VSILOGD("Create a copy node for reshape"); } + return VSI_SUCCESS; } /* op_compute() */ @@ -140,6 +170,9 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; +#ifdef VX_REMOVE_RESHAPE_SUPPORT + self->nn_param.reshape2.local->initialized = FALSE; +#else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return status; @@ -178,7 +211,7 @@ static vsi_status op_optimize self->nn_param.reshape2.local->initialized = TRUE; } } - +#endif return status; } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index ad39a8b..fd544a8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -41,15 +41,11 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (1) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - static vsi_bool _is_same_shape ( @@ -87,24 +83,39 @@ static vsi_status op_compute } else { - vx_nn_scale_params_t para; + char kernel_name[128]; + vsi_nn_kernel_param_t * param = NULL; + int32_t align_corners = self->nn_param.resize.align_corners; + int32_t half_pixel_centers = self->nn_param.resize.half_pixel_centers; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + vsi_nn_kernel_param_add_int32( param, "type", self->nn_param.resize.type ); + switch (self->nn_param.resize.type) { case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: - para.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; break; + snprintf(kernel_name, sizeof(kernel_name), + "resize_nearest"); + break; case VSI_NN_INTERPOLATION_BILINEAR: - para.type = VX_INTERPOLATION_BILINEAR; break; - case VSI_NN_INTERPOLATION_AREA: - para.type = VX_INTERPOLATION_AREA; break; + snprintf(kernel_name, sizeof(kernel_name), + "resize_bilinear"); + break; default: - para.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; } - self->n = vxTensorScaleNode( self->graph->g, inputs[0]->t, &para, - sizeof(vx_nn_scale_params_t), outputs[0]->t ); - if( NULL != self->n ) - { + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, &inputs[0], 1, &outputs[0], 1, param ); + + if (self->n) { status = VSI_SUCCESS; } + + vsi_nn_kernel_param_release(&param); } return status; @@ -151,7 +162,7 @@ static vsi_bool op_setup vsi_enum layout = self->nn_param.resize.layout; vsi_nn_internal_node_t* curr = NULL; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; if (factor != 0) @@ -192,9 +203,7 @@ static vsi_bool op_setup } } - if ( ( self->nn_param.resize.align_corners || - self->nn_param.resize.half_pixel_centers || - layout == VSI_NN_RESIZE_LAYOUT_NHWC ) + if ( ( layout == VSI_NN_RESIZE_LAYOUT_NHWC ) && ( VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type ) ) { self->nn_param.resize.lcl_data->use_internal_node = TRUE; @@ -209,20 +218,6 @@ static vsi_bool op_setup curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); } - else if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && 
(VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) - { - self->nn_param.resize.lcl_data->use_internal_node = TRUE; - - vsi_nn_internal_init_node_wksp( self ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_NEAREST_INTERNAL, 0, 0 ); - curr->node->nn_param.resize_nearest_internal.align_corners = self->nn_param.resize.align_corners; - curr->node->nn_param.resize_nearest_internal.factor = self->nn_param.resize.factor; - curr->node->nn_param.resize_nearest_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; - curr->inputs[0] = inputs[0]; - curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); - } else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) { self->nn_param.resize.lcl_data->use_internal_node = TRUE; @@ -242,7 +237,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if (self->nn_param.resize.lcl_data->use_internal_node) { vsi_nn_safe_free(self->nn_param.resize.lcl_data); @@ -266,7 +260,7 @@ static vsi_status op_init self->nn_param.resize.lcl_data = (vsi_nn_resize_local_data *)malloc( sizeof(vsi_nn_resize_local_data) ); - if( NULL == self->nn_param.resize.lcl_data ) + if ( NULL == self->nn_param.resize.lcl_data ) { VSILOGE( "Create resize local data fail." ); status = VSI_FAILURE; @@ -274,11 +268,8 @@ static vsi_status op_init } memset( self->nn_param.resize.lcl_data, 0, sizeof(vsi_nn_resize_local_data) ); - if (vsi_nn_compareVersion(self->graph, 1, 1, 14) == -1) - { - self->nn_param.resize.align_corners = FALSE; - self->nn_param.resize.half_pixel_centers = FALSE; - } + self->nn_param.resize.align_corners = FALSE; + self->nn_param.resize.half_pixel_centers = FALSE; self->nn_param.resize.layout = VSI_NN_RESIZE_LAYOUT_NCHW; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c index a77de72..5092467 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c @@ -95,7 +95,6 @@ static vsi_status op_compute } } /* op_compute() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -104,33 +103,39 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(REVERSE, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_U8|Q_DFP, D_U8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) - IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC) - IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) - IO_TYPE(D_I16|Q_SYM_PC, D_I16|Q_SYM_PC) - IO_TYPE(D_I32|Q_SYM_PC, D_I32|Q_SYM_PC) - IO_TYPE(D_U8, D_U8) - IO_TYPE(D_I8, D_I8) - IO_TYPE(D_I16, D_I16) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_F16, D_I32) - IO_TYPE(D_U8|Q_ASYM, D_I32) - IO_TYPE(D_I8|Q_DFP, D_I32) - IO_TYPE(D_I16|Q_DFP, D_I32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_DFP, D_U8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM_PC, D_I16|Q_SYM_PC) + IO_TYPE(D_I32|Q_SYM_PC, D_I32|Q_SYM_PC) + IO_TYPE(D_I32, D_I32) + 
IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I16|Q_DFP, D_I32) /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(REVERSE) if(!VALIDATE_OP_IO_TYPES(REVERSE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c index f754a67..49dbd7b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -83,10 +83,25 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + BEGIN_IO_TYPE_DECL(ROI_ALIGN, 3, 1) + IO_TYPE(D_F16, D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_F16, D_I32, D_F32) + IO_TYPE(D_F16, D_F32, D_I32, D_F16) + IO_TYPE(D_F32, D_F32, D_I32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U16|Q_ASYM, D_I32, D_U8|Q_ASYM) + END_IO_TYPE_DECL(ROI_ALIGN) + if (!VALIDATE_OP_IO_TYPES(ROI_ALIGN, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; } /* op_check() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c index 90395c0..b4c8666 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -71,14 +71,26 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(RSQRT, 1, 1) IO_TYPE(D_F16, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_F32) IO_TYPE(D_F32, D_BF16) @@ -106,6 +118,12 @@ static vsi_bool op_check IO_TYPE(D_F16, D_BF16) IO_TYPE(D_F16, D_F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(RSQRT) if(!VALIDATE_OP_IO_TYPES(RSQRT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -136,4 +154,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index fd01b8a..c95d75e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -118,23 +118,60 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(SELECT, 3, 1) - IO_TYPE(D_I8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8, 
D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8, D_F16, D_F16, D_F16) - IO_TYPE(D_I8, D_I32, D_I32, D_I32) - IO_TYPE(D_I8, D_F32, D_F32, D_F32) - IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_BOOL8, D_F16, D_F16, D_F16) - IO_TYPE(D_BOOL8, D_I32, D_I32, D_I32) - IO_TYPE(D_BOOL8, D_F32, D_F32, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8, D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8, D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8, D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8, D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8, D_F16, D_F16, D_F16) + IO_TYPE(D_I8, D_I32, D_I32, D_I32) + IO_TYPE(D_I8, D_F32, D_F32, D_F32) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I32, D_I32, D_I32) + IO_TYPE(D_BOOL8, D_F32, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) END_IO_TYPE_DECL(SELECT) - if(!VALIDATE_OP_IO_TYPES(SELECT, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(SELECT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -193,7 +230,6 @@ static vsi_bool op_setup return ret; } /* op_setup() */ 
- #ifdef __cplusplus extern "C" { #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c index b5ef3e5..bb41e98 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c @@ -84,18 +84,28 @@ static vsi_bool op_check if (self->input.num > 1) { BEGIN_IO_TYPE_DECL(SLICE, 2, 1) - IO_TYPE(D_F16, D_I32, D_F16) - IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) /* HW 9.0 */ IO_TYPE(D_BF16, D_I32, D_BF16) @@ -211,7 +221,7 @@ static vsi_status op_init { return VX_ERROR_NO_MEMORY; } - memset(p->lcl_data, 0, sizeof(vsi_nn_split_lcl_data)); + memset(p->lcl_data, 0, sizeof(vsi_nn_slice_lcl_data)); return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index 257f1e2..b8cd921 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -57,37 +57,47 @@ static vsi_bool op_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(SOFTMAX, 1, 1) /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_F16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32) - - 
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) END_IO_TYPE_DECL(SOFTMAX) - if(!VALIDATE_OP_IO_TYPES(SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c index 34202c3..2cba925 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c @@ -108,13 +108,17 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(SPACE2DEPTH_INTERNAL, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c index a510217..9810b2c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -62,7 +62,7 @@ static vsi_bool op_check /* compute the output tensor number */ num = (uint32_t)(self->output.num - 1); - while( NULL == outputs[num] ) + while ( NULL == outputs[num] ) { num --; } @@ -70,37 +70,37 @@ static vsi_bool op_check ret = TRUE; /* 1. check the input tensor number */ - if(self->input.num != 1) + if (self->input.num != 1) { VSILOGE("The split layer input num must be 1, here is %u\n", self->input.num); return FALSE; } /* 2. check output tensor number */ - if(slices_num == 0) + if (slices_num == 0) { uint32_t remaind = inputs[0]->attr.size[axis] % num; - if(remaind != 0) + if (remaind != 0) { VSILOGE("Can not average the input tensor %u shape\n", axis); return FALSE; } } - else if(slices_num != num) + else if (slices_num != num) { VSILOGE( "slices num %u != output tensor num %u\n", slices_num, num); return FALSE; } /* 3. 
check output tensor shape and dimensions */ - for( i = 0; i < num; i ++ ) + for ( i = 0; i < num; i ++ ) { /* the virtual tensor shape has not been calculated yet */ - if(outputs[i]->attr.vtl == TRUE + if (outputs[i]->attr.vtl == TRUE || outputs[i]->attr.dim_num == VSI_NN_DIM_AUTO) continue; - if( outputs[i]->attr.dim_num != inputs[0]->attr.dim_num ) + if ( outputs[i]->attr.dim_num != inputs[0]->attr.dim_num ) { VSILOGE( "Split dims num(%d vs %d)", outputs[i]->attr.dim_num, @@ -109,14 +109,14 @@ static vsi_bool op_check break; } - for( j = 0; j < outputs[i]->attr.dim_num; j ++ ) + for ( j = 0; j < outputs[i]->attr.dim_num; j ++ ) { - if( axis == j ) + if ( axis == j ) { continue; } - if( outputs[i]->attr.size[j] != inputs[0]->attr.size[j] ) + if ( outputs[i]->attr.size[j] != inputs[0]->attr.size[j] ) { VSILOGE( "Split dims size(%d vs %d)", outputs[i]->attr.size[j], @@ -126,12 +126,12 @@ static vsi_bool op_check } } - if( FALSE == ret ) + if ( FALSE == ret ) { break; } } - for(i = 0; i < num; i++) + for (i = 0; i < num; i++) { BEGIN_IO_TYPE_DECL(SPLIT, 1, 1) IO_TYPE(D_F16, D_F16) @@ -161,7 +161,7 @@ static vsi_bool op_check /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(SPLIT) - if(!VALIDATE_OP_IO_TYPES(SPLIT, self, inputs, 1, &outputs[i], 1)) { + if (!VALIDATE_OP_IO_TYPES(SPLIT, self, inputs, 1, &outputs[i], 1)) { char* desc = generate_op_io_types_desc(inputs, 1, &outputs[i], 1); VSILOGE("Inputs/Outputs data type not support: %s", desc); destroy_op_io_types_desc(desc); @@ -179,7 +179,7 @@ static vsi_bool op_setup ) { vsi_bool ret; - uint32_t i,num; + uint32_t i, num; vsi_size_t average; vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; @@ -193,7 +193,7 @@ static vsi_bool op_setup average = 1; /* compute the output tensor number */ num = (uint32_t)(self->output.num - 1); - while( NULL == outputs[num] ) + while ( NULL == outputs[num] ) { num --; } @@ -202,7 +202,7 @@ static vsi_bool op_setup p = &(self->nn_param.split); vsi_nn_internal_init_node_wksp( self ); - if(slices_num == 0) + if (slices_num == 0) { average = inputs[0]->attr.size[axis] / num; } @@ -211,7 +211,7 @@ static vsi_bool op_setup { p->lcl_data->stride_dims[i] = 1; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { end[i] = inputs[0]->attr.size[i]; } @@ -231,7 +231,7 @@ static vsi_bool op_setup outputs[i]->attr.size[j] = inputs[0]->attr.size[j]; } outputs[i]->attr.size[axis] = end[axis] - start[axis]; - for(j = 0; j < VSI_NN_MAX_DIM_NUM; j++) + for (j = 0; j < VSI_NN_MAX_DIM_NUM; j++) { p->lcl_data->begin_dims[j] = (int32_t)start[j]; p->lcl_data->end_dims[j] = (int32_t)end[j]; @@ -368,4 +368,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index 250f4f3..5fe93f7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -89,6 +88,8 @@ static vsi_bool op_check } } + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + return ret; } /* op_check() */ @@ -191,4 +192,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index dcd34fe..aa22120 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -297,7 +297,7 @@ static vsi_status op_compute dst_tensor = p->dst_tensor ? p->dst_tensor : outputs[0]->t; p->cp_node = vxTensorCopyNode(self->graph->g, p->src_tensor, dst_tensor ); - if( NULL == p->cp_node ) + if ( NULL == p->cp_node ) { VSILOGE( "Create vxTensorCopyNode fail." ); status = VSI_FAILURE; @@ -322,7 +322,7 @@ static vsi_status op_compute self->graph, (uint8_t *)start_dims, &attr); - if( NULL == begin_dims_tensor ) + if ( NULL == begin_dims_tensor ) { VSILOGE("Create begin_dims_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -341,7 +341,7 @@ static vsi_status op_compute self->graph, (uint8_t *)stop_dims, &attr); - if( NULL == end_dims_tensor ) + if ( NULL == end_dims_tensor ) { VSILOGE("Create end_dims_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -360,7 +360,7 @@ static vsi_status op_compute self->graph, (uint8_t *)stride_dims, &attr); - if( NULL == stride_dims_tensor ) + if ( NULL == stride_dims_tensor ) { VSILOGE("Create stride_dims_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -396,7 +396,7 @@ static vsi_status op_compute } output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], sizes, dims); - if( NULL == output_tensor ) + if ( NULL == output_tensor ) { VSILOGE("Create output_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -415,7 +415,7 @@ static vsi_status op_compute vsi_nn_ReleaseTensor(&output_tensor); } - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -436,10 +436,18 @@ static vsi_bool op_check IO_TYPE(D_F16, D_I16|Q_DFP) IO_TYPE(D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_F32, D_F32) IO_TYPE(D_BF16, D_BF16) @@ -469,8 +477,14 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F32) IO_TYPE(D_U8|Q_ASYM, D_I32|Q_ASYM) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(STRIDED_SLICE) - if (!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, self->input.num, outputs, self->output.num)) + if (!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, 1, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); @@ -482,6 +496,46 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +int32_t _reverse_mask_bits(int32_t mask, int32_t dims) +{ + int32_t i = 0; + int32_t new_mask = 0; + int32_t bits = mask; + int32_t leading_one = 0; + + for (leading_one = 0; leading_one < VSI_NN_MAX_DIM_NUM; leading_one ++) + { + if ( bits == 0 ) + { + break; + } + + bits >>= 1; + } + + dims = vsi_nn_max(dims, leading_one); + for (i = 0; i < dims; i++) + { + int32_t offset = dims - i - 1; + if (mask & (1 << i)) + { + new_mask |= (1 << offset); + } + } + + return new_mask; +} + +void _reverse_indices(int32_t *dst, const int32_t *src, int32_t dims) +{ + int32_t i = 0; + + for (i = 0; i < dims; i++) + { + dst[dims - i - 1] = src[i]; + } +} + static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_params, int32_t input_dims) { uint32_t i = 0; @@ -490,37 
+544,60 @@ static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_para int32_t begin_mask = op_params->begin_mask; int32_t end_mask = op_params->end_mask; int32_t shrink_axis_mask = op_params->shrink_axis_mask; + int32_t new_axis_mask = op_params->new_axis_mask; + int32_t start_indices[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t stop_indices[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t strides[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t start_mask = 0; + int32_t stop_mask = 0; + int32_t shrink_mask = 0; + int32_t output_dims = input_dims; const int32_t *begin_dims = op_params->begin_dims; const int32_t *end_dims = op_params->end_dims; const int32_t *stride_dims = op_params->stride_dims; strided_slice_param *params = &op_params->lcl2_data->params; + begin_mask = _reverse_mask_bits(begin_mask, input_dims); + end_mask = _reverse_mask_bits(end_mask, input_dims); + shrink_axis_mask = _reverse_mask_bits(shrink_axis_mask, input_dims); + _reverse_indices(start_indices[0], begin_dims, op_params->begin_dims_num); + _reverse_indices(stop_indices[0], end_dims, op_params->end_dims_num); + _reverse_indices(strides[0], stride_dims, op_params->stride_dims_num); + for (i = 0; i < op_params->begin_dims_num; i++) { if ( op_params->new_axis_mask & (1 << i)) { num_add_axis ++; + output_dims ++; } } + for (i = 0; i < (uint32_t)(input_dims + num_add_axis); i++) + { + if ( op_params->shrink_axis_mask & (1 << i)) + { + output_dims --; + } + } + + new_axis_mask = _reverse_mask_bits(new_axis_mask, output_dims); + params->num_add_axis = num_add_axis; for (i = 0; i < (uint32_t)(input_dims + num_add_axis); i++) { - if ( op_params->new_axis_mask & (1 << i) ) + if ( new_axis_mask & (1 << i) ) { continue; } else if (i >= op_params->begin_dims_num + added_ellipsis) { - params->begin_mask |= (1 << params->begin_dims_num); - params->end_mask |= (1 << params->end_dims_num); - params->begin_dims[params->begin_dims_num ++ ] = - 0; - params->end_dims[params->end_dims_num ++] = - 0; - params->stride_dims[params->stride_dims_num ++] = - 1; + start_mask |= (1 << params->begin_dims_num); + stop_mask |= (1 << params->end_dims_num); + start_indices[1][params->begin_dims_num ++ ] = 0; + stop_indices[1][params->end_dims_num ++] = 0; + strides[1][params->stride_dims_num ++] = 1; } else { @@ -528,28 +605,32 @@ static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_para if (begin_mask & (1 << orig_idx)) { - params->begin_mask |= (1 << params->begin_dims_num); + start_mask |= (1 << params->begin_dims_num); } if (end_mask & (1 << orig_idx)) { - params->end_mask |= (1 << params->end_dims_num); + stop_mask |= (1 << params->end_dims_num); } if (shrink_axis_mask & (1 << orig_idx)) { - params->shrink_axis_mask |= (1 << params->begin_dims_num); + shrink_mask |= (1 << params->begin_dims_num); } - params->begin_dims[params->begin_dims_num ++] = - begin_dims[orig_idx]; - params->end_dims[params->end_dims_num ++] = - end_dims[orig_idx]; - params->stride_dims[params->stride_dims_num ++] = - stride_dims[orig_idx]; + start_indices[1][params->begin_dims_num ++] = start_indices[0][orig_idx]; + stop_indices[1][params->end_dims_num ++] = stop_indices[0][orig_idx]; + strides[1][params->stride_dims_num ++] = strides[0][orig_idx]; } } + params->begin_mask = _reverse_mask_bits(start_mask, input_dims); + params->end_mask = _reverse_mask_bits(stop_mask, input_dims); + params->shrink_axis_mask = _reverse_mask_bits(shrink_mask, input_dims); + _reverse_indices(params->begin_dims, start_indices[1], params->begin_dims_num); + 
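
(Aside, not part of the patch.) The two helpers above mirror mask bits and index arrays, apparently because the incoming strided-slice attributes are expressed against the opposite dimension order from the one used internally. A minimal stand-alone sketch of the bit flip, with made-up values and the dims-widening step of the real helper omitted:

/* mask_flip_demo.c -- illustrative only; build with: cc mask_flip_demo.c && ./a.out */
#include <stdio.h>

static int reverse_mask_bits(int mask, int dims)
{
    int i, new_mask = 0;
    for (i = 0; i < dims; i++)
    {
        if (mask & (1 << i))
        {
            new_mask |= (1 << (dims - i - 1));   /* bit i -> bit dims-1-i */
        }
    }
    return new_mask;
}

int main(void)
{
    /* begin_mask 0b0011 (axes 0 and 1) on a 4-D shape maps to 0b1100 (axes 2 and 3). */
    printf("0x%x\n", reverse_mask_bits(0x3, 4));   /* prints 0xc */
    return 0;
}
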
_reverse_indices(params->end_dims, stop_indices[1], params->end_dims_num); + _reverse_indices(params->stride_dims, strides[1], params->stride_dims_num); + return TRUE; } @@ -678,7 +759,7 @@ static vsi_status op_optimize VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if( NULL == inputs[0]->t ) + if ( NULL == inputs[0]->t ) { vsi_nn_TensorReinit( self->graph, inputs[0] ); } @@ -687,7 +768,7 @@ static vsi_status op_optimize memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); - if( NULL == in_view_tensor ) + if ( NULL == in_view_tensor ) { VSILOGE( "Create tensor %d from view fail.", i ); status = VSI_FAILURE; @@ -697,12 +778,12 @@ static vsi_status op_optimize self->nn_param.strided_slice.lcl2_data->is_optimized = TRUE; is_same_quant_type = _is_same_quant(inputs, outputs); - if( NULL != outputs[0]->t || is_same_quant_type == FALSE) + if ( NULL != outputs[0]->t || is_same_quant_type == FALSE) { VSILOGI( "stride slice copy tensor."); // Copy old tensor values to the new address. status = copy_tensor_to_view( self, in_view_tensor, outputs[0]); - if( VSI_FAILURE == status ) + if ( VSI_FAILURE == status ) { goto OnError; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c b/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c index 7900d88..2aaeb36 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -77,32 +76,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(SWISH, 1, 1) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - END_IO_TYPE_DECL(SWISH) - if(!VALIDATE_OP_IO_TYPES(SWISH, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = FALSE; - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_bool op_setup @@ -153,4 +131,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c index b2c13dd..647396f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include "vsi_nn_types.h" @@ -57,7 +56,6 @@ static vsi_status _tile_op_compute &inputs[0], 1, &outputs[0], 1, NULL ); - if( self->n ) { status = VSI_SUCCESS; @@ -77,17 +75,21 @@ static vsi_bool op_check vsi_nn_tile_param * p; BEGIN_IO_TYPE_DECL(TILE, 1, 1) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - 
IO_TYPE(D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_U32, D_U32) - IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_F32, D_F32) END_IO_TYPE_DECL(TILE) - if(!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -160,4 +162,3 @@ DEF_TILE_OP( TILE, tile ); #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c index 34becd3..1923b26 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -53,7 +53,7 @@ static vsi_bool vsi_nn_upsample_optimize_shape ) { vsi_bool enable_image_2d = FALSE; - vsi_ssize_t hwLitimLen = 65536; + vsi_ssize_t hwLitimLen = GPU_TENSOR_MAX_WIDTH; if ((2 == self->nn_param.upsample.scale[0]) && (2 == self->nn_param.upsample.scale[1])) @@ -166,7 +166,6 @@ static vsi_status op_compute if( ret ) { - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, @@ -311,4 +310,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index ade122c..b782511 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -444,6 +444,13 @@ static _op_param_gen_t s_op_gen[] = /* GATHER_ELEMENTS */ NULL, /* SELU */ NULL, /* CELU */ NULL, + /* MAX_POOL3D */ NULL, + /* RCP */ NULL, + /* SIGN */ NULL, + /* SOFTSIGN */ NULL, + /* CUMSUM */ NULL, + /* MAXPOOLWITHARGMAX */ NULL, + /* MOD */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 92dedcc..2e6b26e 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -31,7 +31,7 @@ #include "kernel/vsi_nn_kernel.h" #define DEF_DTYPE_CONVERT_NORMAL(SRC_NAME, SRC_DTYPE, DST_NAME, DST_DTYPE) \ -static inline void _convert_##SRC_NAME##_to_##DST_NAME \ +static VSI_INLINE_API void _convert_##SRC_NAME##_to_##DST_NAME \ ( \ const SRC_DTYPE * buffer, \ size_t size, \ @@ -60,7 +60,7 @@ DEF_DTYPE_CONVERT_NORMAL( float, float, uint32, uint32_t ) DEF_DTYPE_CONVERT_NORMAL( float, float, uint16, uint16_t ) #undef DEF_DTYPE_CONVERT_NORMAL -static inline void _convert_float16_to_float +static VSI_INLINE_API void _convert_float16_to_float ( const vsi_float16 * buffer, size_t size, @@ -74,7 +74,7 @@ static inline void _convert_float16_to_float } } /* _convert_float16_to_float */ -static inline void _convert_float_to_float16 +static VSI_INLINE_API void _convert_float_to_float16 ( const float * buffer, size_t size, @@ -88,7 +88,7 @@ static inline void _convert_float_to_float16 } } /* 
_convert_float_to_float16 */ -static inline void _convert_bfloat16_to_float +static VSI_INLINE_API void _convert_bfloat16_to_float ( const vsi_bfloat16 * buffer, size_t size, @@ -102,7 +102,7 @@ static inline void _convert_bfloat16_to_float } } /* _convert_bfloat16_to_float */ -static inline void _convert_float_to_bfloat16 +static VSI_INLINE_API void _convert_float_to_bfloat16 ( const float * buffer, size_t size, @@ -163,7 +163,7 @@ DEF_DTYPE_CONVERT_QUANTIZE( symm16, int16_t, vsi_rtne, SHRT_MIN, SHRT_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm32, int32_t, vsi_rtne, INT_MIN, INT_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm64, int64_t, vsi_rtne, LLONG_MIN, LLONG_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( asymm8, uint8_t, vsi_rtne, 0, UCHAR_MAX ) -//DEF_DTYPE_CONVERT_QUANTIZE( asymm16, uint16_t, vsi_rtne, 0, USHRT_MAX ) +DEF_DTYPE_CONVERT_QUANTIZE( asymm16, uint16_t, vsi_rtne, 0, USHRT_MAX ) //DEF_DTYPE_CONVERT_QUANTIZE( asymm32, uint32_t, vsi_rtne, 0, UINT_MAX ) #undef DEF_DTYPE_CONVERT_QUANTIZE @@ -419,6 +419,9 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case U16: + return vsi_nn_dtype_convert_quantize_asymm16_to_float( + (const uint16_t*)buffer, size, scale, zero_point, out_buffer); case I8: return vsi_nn_dtype_convert_quantize_symm8_to_float( (const int8_t *)buffer, size, scale, zero_point, out_buffer ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c index 3c45846..6547f46 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -447,34 +447,66 @@ vsi_bool vsi_nn_DtypeCompare vsi_nn_dtype_t *dtype1 ) { - if(NULL == dtype0 || NULL == dtype1) + if (NULL == dtype0 || NULL == dtype1) { return FALSE; } - if(dtype0->vx_type != dtype1->vx_type || dtype0->qnt_type != dtype1->qnt_type) + if ( dtype0->vx_type != dtype1->vx_type || + dtype0->qnt_type != dtype1->qnt_type ) { return FALSE; } - if(dtype0->qnt_type == VSI_NN_QNT_TYPE_DFP) + + switch (dtype0->qnt_type) { - if(dtype0->fl != dtype1->fl) + case VSI_NN_QNT_TYPE_DFP: + if (dtype0->fl != dtype1->fl) + { + return FALSE; + } + break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: { - return FALSE; + const float diff = (float)1e-5; + if (dtype0->zero_point != dtype1->zero_point) + { + return FALSE; + } + if (vsi_nn_float_compare(dtype0->scale, dtype1->scale, diff) + == FALSE) + { + return FALSE; + } + + break; } - } - else if( dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC || - dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - const float diff = (float)1e-5; - if(dtype0->zero_point != dtype1->zero_point) + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: { - return FALSE; - } - if(vsi_nn_float_compare(dtype0->scale, dtype1->scale, diff) == FALSE) - { - return FALSE; + const float diff = (float)1e-5; + int32_t i = 0; + int32_t scale_cnt0 = dtype0->scale_dim; + int32_t scale_cnt1 = dtype1->scale_dim; + + if (scale_cnt0 == scale_cnt1) + { + const float* src_scale_ptr = dtype0->scales; + const float* dst_scale_ptr = dtype1->scales; + for (i = 0; i < scale_cnt0; i++) + { + if (vsi_nn_float_compare(src_scale_ptr[i],dst_scale_ptr[i], diff) + == FALSE) + { + return FALSE; + } + } + } + break; } + default: + break; } return TRUE; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c 
b/src/tim/vx/internal/src/utils/vsi_nn_util.c index bd14b39..25ffab7 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -28,7 +28,7 @@ #include #include -#ifdef _WIN32 +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #include #include #else @@ -112,7 +112,7 @@ char* vsi_nn_strncpy ) { char* ret = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) strncpy_s(dest, count, source, _TRUNCATE); #else strncpy(dest, source, count); @@ -128,7 +128,7 @@ char* vsi_nn_strncat ) { char* ret = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) strncat_s(dest, count, source, _TRUNCATE); ret = dest; #else @@ -143,7 +143,7 @@ char* vsi_nn_getenv ) { char* var = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) size_t var_size = 0; _dupenv_s(&var, &var_size, var_name); #else @@ -159,7 +159,7 @@ FILE* vsi_nn_fopen ) { FILE * file = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) fopen_s(&file, file_name, mode); #else file = fopen(file_name, mode); @@ -795,7 +795,7 @@ int32_t vsi_nn_Access return -1; } -#ifdef _WIN32 +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) return _access(path, mode); #else return access(path, mode); @@ -813,7 +813,7 @@ int32_t vsi_nn_Mkdir return -1; } -#ifdef _WIN32 +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) return _mkdir(path); #else return mkdir(path, mode); @@ -1128,59 +1128,67 @@ vsi_bool vsi_nn_is_same_quant_type( vsi_nn_tensor_t * dst ) { - vx_bool result = FALSE; + vsi_nn_dtype_t *src_dtype = NULL, *dst_dtype = NULL; - if (src->attr.dtype.vx_type == dst->attr.dtype.vx_type) + src_dtype = &src->attr.dtype; + dst_dtype = &dst->attr.dtype; + + if (src_dtype->qnt_type != dst_dtype->qnt_type) { - switch (src->attr.dtype.qnt_type) - { - case VSI_NN_QNT_TYPE_NONE: - result = TRUE; - break; + return FALSE; + } + switch (src_dtype->qnt_type) + { case VSI_NN_QNT_TYPE_DFP: - if (src->attr.dtype.fl == dst->attr.dtype.fl) + if (src_dtype->fl != dst_dtype->fl) { - result = TRUE; + return FALSE; } break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: - if (src->attr.dtype.scale == dst->attr.dtype.scale && - src->attr.dtype.zero_point == dst->attr.dtype.zero_point) + { + const float diff = (float)1e-5; + if (src_dtype->zero_point != dst_dtype->zero_point) { - result = TRUE; + return FALSE; + } + if (vsi_nn_float_compare(src_dtype->scale, dst_dtype->scale, diff) + == FALSE) + { + return FALSE; } break; - + } case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + { + const float diff = (float)1e-5; + int32_t i = 0; + int32_t scale_cnt0 = src_dtype->scale_dim; + int32_t scale_cnt1 = dst_dtype->scale_dim; + + if (scale_cnt0 == scale_cnt1) { - int32_t i = 0; - int32_t scale_cnt0 = src->attr.dtype.scale_dim; - int32_t scale_cnt1 = dst->attr.dtype.scale_dim; - - if (scale_cnt0 == scale_cnt1) + const float* src_scale_ptr = src_dtype->scales; + const float* dst_scale_ptr = dst_dtype->scales; + for (i = 0; i < scale_cnt0; i++) { - const float *src_scale_ptr = src->attr.dtype.scales; - const float *dst_scale_ptr = dst->attr.dtype.scales; - for (i = 0; i < scale_cnt0; i++) + if (vsi_nn_float_compare( + src_scale_ptr[i], dst_scale_ptr[i], diff) == FALSE) { - if (src_scale_ptr[i] != dst_scale_ptr[i]) - break; + return FALSE; } - - if (i == 
scale_cnt0)
-                result = TRUE;
             }
         }
         break;
-
+        }
         default:
             break;
-        }
     }
-    return result;
+    return TRUE;
 }
 
 vsi_bool vsi_nn_is_same_type
@@ -1220,6 +1228,67 @@ vsi_bool vsi_nn_is_broadcast_operaton
     return FALSE;
 }
 
+vsi_bool vsi_nn_is_broadcast_axes_operaton
+    (
+    vsi_nn_tensor_t ** inputs,
+    size_t input_num,
+    vsi_nn_tensor_t * output,
+    int32_t * axis,
+    int32_t axis_num
+    )
+{
+    vsi_size_t out_rank = output->attr.dim_num;
+    vsi_size_t i = 0;
+
+    if (vsi_nn_is_broadcast_operaton(inputs, input_num, output) == FALSE)
+    {
+        return FALSE;
+    }
+
+    for (i = 0; i < out_rank; i++)
+    {
+        size_t j = 0;
+        int32_t k = 0;
+        vsi_size_t src0_size = i < inputs[0]->attr.dim_num ?
+            inputs[0]->attr.size[i] : 1;
+
+        for (k = 0; k < axis_num; k++)
+        {
+            if (axis[k] == (int32_t)i)
+            {
+                for (j = 1; j < input_num; j++)
+                {
+                    vsi_size_t src_size = i < inputs[j]->attr.dim_num ?
+                        inputs[j]->attr.size[i] : 1;
+
+                    if (src0_size == src_size)
+                    {
+                        return FALSE;
+                    }
+                }
+
+                break;
+            }
+        }
+
+        if (axis[k] == (int32_t)i)
+        {
+            continue;
+        }
+
+        for (j = 1; j < input_num; j++)
+        {
+            vsi_size_t src_size = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1;
+
+            if (src0_size != src_size)
+            {
+                return FALSE;
+            }
+        }
+    }
+    return TRUE;
+}
+
 float vsi_nn_get_tensor_scale
     (
     vsi_nn_tensor_t * tensor
diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c
index acd5b4f..cbddf2d 100644
--- a/src/tim/vx/internal/src/vsi_nn_context.c
+++ b/src/tim/vx/internal/src/vsi_nn_context.c
@@ -62,6 +62,8 @@ static vsi_status query_hardware_caps
         sizeof(vx_hardware_caps_params_ext2_t));
     context->config.support_stream_processor = paramExt.supportStreamProcessor;
     context->config.sp_exec_count = paramExt2.streamProcessorExecCount;
+    context->config.sp_vector_depth = paramExt2.streamProcessorVectorSize;
+    context->config.sp_per_core_vector_depth = context->config.sp_vector_depth / context->config.sp_exec_count;
 #endif
 #endif
diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c
index ee81ac1..535f595 100644
--- a/src/tim/vx/internal/src/vsi_nn_graph.c
+++ b/src/tim/vx/internal/src/vsi_nn_graph.c
@@ -1875,7 +1875,6 @@ final:
     return status;
 } /* vsi_nn_TrySetupCompleteSignalNode() */
 
-
 /*
  * Documented in vsi_nn_graph.h
  */
@@ -1884,7 +1883,7 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs
     vsi_nn_graph_t* graph
     )
 {
-    uint32_t i,j;
+    uint32_t i,j,k,p;
     vsi_status status = VSI_FAILURE;
     uint32_t num_of_graph_inputs;
     uint32_t num_of_graph_real_inputs;
@@ -1911,6 +1910,33 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs
             ;//do nothing
         }
     }
+    /*update inputs for nbg node who has crop scalar parameter as inputs*/
+    for (i = 0; i < graph->node_num; i++)
+    {
+        vsi_nn_node_t* node = vsi_nn_GetNode(graph, i);
+        uint32_t numParams = 0;
+        if (node->op == VSI_NN_OP_NBG)
+        {
+            status = vxQueryNode(
+                node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams));
+            for (j = 0; j < numParams; j++)
+            {
+                vx_parameter param = 0;
+                vx_enum type = 0;
+                param = vxGetParameterByIndex(node->n, j);
+                status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum));
+                if (type == VX_TYPE_SCALAR)
+                {
+                    num_of_graph_real_inputs++;
+                }
+                if (param != NULL)
+                {
+                    vxReleaseParameter(&param);
+                    param = NULL;
+                }
+            }
+        }
+    }
     graph_inputs = (vx_reference *)malloc( num_of_graph_real_inputs * sizeof( vx_reference ) );
     CHECK_PTR_FAIL_GOTO( graph_inputs, "Create buffer fail.", final );
     for( i = 0, j = 0; i < num_of_graph_inputs; i++ )
@@ -1924,6 +1950,52 @@ vsi_status
vsi_nn_setup_binary_graph_inputs_outputs
             goto final;
         }
         graph_inputs[j++] = (vx_reference)( tensor->t );
+        for (k = 0; k < graph->node_num; k++)
+        {
+            vsi_nn_node_t* node = vsi_nn_GetNode(graph, k);
+            if (node->op == VSI_NN_OP_NBG)
+            {
+                vx_parameter param = 0;
+                vx_reference ref = 0;
+                vx_enum type = 0;
+                uint32_t scalar_index = j;
+                param = vxGetParameterByIndex(node->n, scalar_index);
+                status = vxQueryParameter(param,
+                    VX_PARAMETER_TYPE,
+                    &type,
+                    sizeof(vx_enum));
+                if (param != NULL)
+                {
+                    vxReleaseParameter(&param);
+                    param = NULL;
+                }
+                if (type != VX_TYPE_SCALAR)
+                {
+                    break;
+                }
+                for (p = scalar_index; p < scalar_index+4; p++)
+                {
+                    param = vxGetParameterByIndex(node->n, p);
+                    status = vxQueryParameter(param,
+                        VX_PARAMETER_TYPE,
+                        &type,
+                        sizeof(vx_enum));
+                    if (type == VX_TYPE_SCALAR)
+                    {
+                        vxQueryParameter(param,
+                            VX_PARAMETER_REF,
+                            &ref,
+                            sizeof(vx_reference));
+                        graph_inputs[j++] = ref;
+                        vxReleaseReference(&ref);
+                    }
+                    if (param != NULL)
+                    {
+                        vxReleaseParameter(&param);
+                    }
+                }
+            }
+        }
     }
     else
     {
diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
index 1f46c3f..1845bc7 100644
--- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
+++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
@@ -195,6 +195,9 @@ static _node_template s_template[] =
     /* GRU */ NULL,
     /* GRUCELL */ NULL,
     /* GRUCELL_ACTIVATION */ NULL,
+    /* CUMSUM */ NULL,
+    /* MAXPOOLWITHARGMAX */ NULL,
+    /* MOD */ NULL,
     };
 //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c );
diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c
index 3b10e44..10f25ac 100644
--- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c
+++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c
@@ -835,15 +835,15 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
             if (node->op == VSI_NN_OP_PRE_PROCESS && node->nn_param.pre_process.type != VSI_NN_SOURCE_FORMAT_TENSOR)
             {
-                if(node->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR)
-                {
+                //if(node->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR)
+                //{
                     /* 2 additional input tensors and 4 paramter scalar*/
-                    num_of_graph_real_inputs += 6;
-                }
-                else
-                {
-                    num_of_graph_real_inputs += 4;
-                }
+                //    num_of_graph_real_inputs += 6;
+                //}
+                //else
+                //{
+                    num_of_graph_real_inputs += 4;
+                //}
             }
         }
     }
@@ -885,10 +885,10 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
                 break;
             }
         }
-        if (!processed && enabled)
+        if (!processed)
         {
             processed_node_id_list[processed_idx++] = node->uid;
-            if (node->op == VSI_NN_OP_PRE_PROCESS)
+            if (enabled)
             {
                 vx_node prenode = NULL;
                 vx_uint32 numParams = 0;
@@ -1028,3 +1028,82 @@ final:
     }
     return status;
 } /* vs_nn_AddBinaryGraphInputsWithCropParam() */
+
+vsi_status vsi_nn_UpdateCropParamsForBinaryGraph
+(
+    vsi_nn_graph_t* graph,
+    uint32_t enabled_crop_input_idx,
+    uint32_t start_x,
+    uint32_t start_y,
+    uint32_t crop_w,
+    uint32_t crop_h,
+    uint32_t dst_w,
+    uint32_t dst_h
+)
+{
+    uint32_t i, j;
+    uint32_t numParams = 0;
+    int32_t scalar_value[4] = {0};
+    vsi_status status = VSI_FAILURE;
+    uint32_t input_idx = enabled_crop_input_idx;
+    scalar_value[0] = (int32_t)((crop_w << 15) / dst_w);
+    scalar_value[1] = (int32_t)((crop_h << 15) / dst_h);
+    scalar_value[2] = start_x; /*rgb start_x*3, rgb start_x*4*/
+    scalar_value[3] = start_y;
+
+    for (i = 0; i < graph->node_num; i++)
+    {
+        vsi_nn_node_t* node = vsi_nn_GetNode(graph, i);
+        if (node->op == VSI_NN_OP_NBG)
+        {
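
(Aside on the scalar packing a few lines up, not part of the patch.) scalar_value[0] and scalar_value[1] look like crop-to-destination scale ratios encoded in <<15 fixed point, followed by the crop origin; a tiny stand-alone sketch with made-up sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Hypothetical crop: a 640x360 window resized to a 320x180 network input. */
    uint32_t crop_w = 640, crop_h = 360, dst_w = 320, dst_h = 180;

    /* Same packing as scalar_value[0..1] above: ratio * 2^15. */
    int32_t x_ratio = (int32_t)((crop_w << 15) / dst_w);
    int32_t y_ratio = (int32_t)((crop_h << 15) / dst_h);

    printf("%d %d\n", x_ratio, y_ratio);   /* 65536 65536, i.e. 2.0 in <<15 */
    return 0;
}
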
+ vx_parameter param = 0; + vx_enum type = 0; + vx_reference ref = 0; + uint32_t scalar_idx = 0; + uint32_t scalar_value_idx = 0; + int32_t temp_value = 0; + status = vxQueryNode(node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); + for (j = 0; j < numParams; j++) + { + + param = vxGetParameterByIndex(node->n, j); + status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } + } + while (input_idx > 0) + { + uint32_t tensor_idx = scalar_idx + 4; + for (j = tensor_idx; j < numParams; j++) + { + param = vxGetParameterByIndex(node->n, j); + status = vxQueryParameter( + param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } + } + input_idx--; + } + for (j = scalar_idx; j < scalar_idx + 4; j++) + { + temp_value = scalar_value[scalar_value_idx++]; + param = vxGetParameterByIndex(node->n, j); + status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + status = vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status = vxWriteScalarValue((vx_scalar)ref, &temp_value); + status = vxSetParameterByIndex(node->n, j, ref); + } + } + + } + } + return status; +} /* vsi_nn_UpdateCropParamsForBinaryGraph() */ diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 8fa073c..b3f8800 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -60,7 +60,7 @@ vsi_bool vsi_nn_rnn_find_best_kernel_size /* try NxN */ if( !multi_batch ) { - #if( !defined( _WIN32 ) ) + #if( !(defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) ) /* try NxN conv */ kernel_h = 8; while( input_size % (kernel_h * kernel_h) != 0 ) @@ -958,12 +958,16 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute { vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tensor0 = NULL; - uint32_t* permute_in_perm = NULL; + uint32_t i = 0, * permute_in_perm = NULL; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(uint32_t)); - memcpy(permute_in_perm, perm, dim_num * sizeof(uint32_t)); + + for (i = 0; i < dim_num; i++) + { + permute_in_perm[i] = (uint32_t)perm[i]; + } curr->node->nn_param.permute.perm = permute_in_perm; curr->node->nn_param.permute.dim_num = (uint32_t)dim_num; curr->inputs[0] = input_tensor; diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 18c964e..54236c0 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -428,7 +428,7 @@ static vsi_bool _init_tensor if ( TRUE == tensor->attr.is_dummy ) { tensor->t = vxCreateDummyTensor( graph->ctx->c, - (vsi_size_t)tensor->attr.dim_num, tensor->attr.size, (vsi_enum)tensor->attr.dtype.vx_type ); + (vsi_size_t)tensor->attr.dim_num, size_vxsize, (vsi_enum)tensor->attr.dtype.vx_type ); } else #endif if( TRUE == tensor->attr.is_created_from_handle )
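
(Aside on the vsi_nn_rnn_create_permute hunk above, not part of the patch.) Replacing the memcpy with an element-wise cast matters when the incoming perm array uses a wider element type than the uint32_t buffer being filled, for example a 64-bit vsi_size_t build; the actual type of the perm argument is not visible in this hunk, so the sketch below just assumes it:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    /* Assume perm arrives as 64-bit values while the node expects uint32_t. */
    uint64_t perm[4] = { 0, 2, 1, 3 };
    uint32_t by_memcpy[4] = { 0 };
    uint32_t by_cast[4] = { 0 };
    int i;

    /* Copying dim_num * sizeof(uint32_t) bytes grabs only the first halves:
       on a little-endian host this yields 0 0 2 0. */
    memcpy(by_memcpy, perm, 4 * sizeof(uint32_t));

    /* The element-wise cast used by the patch keeps 0 2 1 3. */
    for (i = 0; i < 4; i++)
    {
        by_cast[i] = (uint32_t)perm[i];
    }

    printf("memcpy: %u %u %u %u  cast: %u %u %u %u\n",
        by_memcpy[0], by_memcpy[1], by_memcpy[2], by_memcpy[3],
        by_cast[0], by_cast[1], by_cast[2], by_cast[3]);
    return 0;
}
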