diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION
index 2524731..703a8d3 100644
--- a/prebuilt-sdk/x86_64_linux/VERSION
+++ b/prebuilt-sdk/x86_64_linux/VERSION
@@ -1 +1 @@
-REL/6.4.9
+REL/6.4.10.2
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h
index d412514..48f824f 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h
@@ -3347,6 +3347,36 @@ VX_API_ENTRY vx_status VX_API_CALL vxSwapTensorHandle(vx_tensor tensor, void* ne
VX_API_ENTRY vx_status VX_API_CALL vxCopyTensorPatch(vx_tensor tensor, vx_size number_of_dims, const vx_size * view_start, const vx_size * view_end,
const vx_size * user_stride, void * user_ptr, vx_enum usage, vx_enum user_memory_type);
+/*! \brief Allows the application to copy a view patch from/into a tensor object.
+ * \param [in] tensor The reference to the tensor object that is the source or the
+ * destination of the copy.
+ * \param [in] number_of_dims Number of patch dimensions. An error is returned if this is 0 or greater than the number of
+ * tensor dimensions. If smaller than the number of tensor dimensions, the lower dimensions are assumed.
+ * \param [in] view_start Array of patch start points in each dimension
+ * \param [in] view_end Array of patch end points in each dimension
+ * \param [in] addressing Pointer to a patch addressing structure of type \ref vx_tensorpatch_addressing_t.
+ * \param [in] size_of_addressing Size in bytes of the addressing structure.
+ * \param [in] user_ptr The address of the memory location where to store the requested data
+ * if the copy was requested in read mode, or from where to get the data to store into the tensor
+ * object if the copy was requested in write mode. The accessible memory must be large enough
+ * to contain the specified patch with the specified layout:\n
+ * accessible memory in bytes >= (end[last_dimension] - start[last_dimension]) * stride[last_dimension].\n
+ * The layout of the user memory must follow a row major order.
+ * \param [in] usage This declares the effect of the copy with regard to the tensor object
+ * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY are supported:
+ * \arg \ref VX_READ_ONLY means that data is copied from the tensor object into the application memory
+ * \arg \ref VX_WRITE_ONLY means that data is copied into the tensor object from the application memory
+ * \param [in] user_memory_type A \ref vx_memory_type_e enumeration that specifies
+ * the memory type of the memory referenced by user_ptr.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual tensor that cannot be
+ * accessed by the application.
+ * \retval VX_ERROR_INVALID_REFERENCE The tensor reference is not actually a tensor reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS Another parameter is incorrect.
+ * \ingroup group_object_tensor
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxCopyTensorPatch2(vx_tensor tensor, vx_size number_of_dims, const vx_size * view_start, const vx_size * view_end,
+ const vx_tensorpatch_addressing_t * addressing, vx_size size_of_addressing, void * user_ptr, vx_enum usage, vx_enum user_memory_type);
+
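+/* Example (illustrative sketch, not part of the API): copying a 16x16 patch of a
+ * 2-D vx_int16 tensor into row-major host memory with vxCopyTensorPatch2. The
+ * tensor handle and the patch geometry below are assumptions for illustration only.
+ *
+ *     vx_size start[2]   = {0, 0};
+ *     vx_size end[2]     = {16, 16};
+ *     vx_size dims[2]    = {16, 16};
+ *     vx_size strides[2] = {sizeof(vx_int16), 16 * sizeof(vx_int16)};
+ *     vx_int16 buffer[16 * 16];
+ *     vx_tensorpatch_addressing_t addr;
+ *     addr.num_of_dims   = 2;
+ *     addr.dim_sizes     = dims;
+ *     addr.strides       = strides;
+ *     addr.stride_x_bits = 0;   // element stride is a whole number of bytes here
+ *     vx_status status = vxCopyTensorPatch2(tensor, 2, start, end,
+ *         &addr, sizeof(addr), buffer, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
+ */
+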
/*! \brief Allows the application to get direct access to a patch of tensor object.
* \param [in] tensor The reference to the tensor object that is the source or the
* destination for direct access.
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
index 782961c..a8ea910 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
@@ -50,7 +50,6 @@ enum vx_library_e {
* \ingroup group_kernel
*/
enum vx_kernel_e {
-
/*!
* \brief The Color Space conversion kernel.
* \details The conversions are based on the \ref vx_df_image_e code in the images.
@@ -377,7 +376,7 @@ enum vx_kernel_e {
* \see group_vision_function_min
*/
VX_KERNEL_MIN = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3F,
-
+
/*! \brief The weigthed average kernel.
* \see group_vision_function_weighted_average
*/
@@ -391,14 +390,14 @@ enum vx_kernel_e {
VX_KERNEL_NN_FULLY_CONNECTED_RELU_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2,
//VX_KERNEL_NN_SOFTMAX_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x3,
-
- //VX_KERNEL_NN_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4,
+
+ //VX_KERNEL_NN_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4,
VX_KERNEL_NN_LRN_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x3,
- //VX_KERNEL_NN_NORMALIZE_IMAGE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4,
+ //VX_KERNEL_NN_NORMALIZE_IMAGE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4,
- //VX_KERNEL_NN_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x7,
+ //VX_KERNEL_NN_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x7,
//VX_KERNEL_NN_ACTIVATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x9,
@@ -415,7 +414,7 @@ enum vx_kernel_e {
//VX_KERNEL_NN_CONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xF,
VX_KERNEL_NN_CONCATINDEFINITE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x8,
-
+
VX_KERNEL_NN_REORG_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x9,
//VX_KERNEL_NN_DECONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x12,
@@ -429,9 +428,9 @@ enum vx_kernel_e {
VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xD,
VX_KERNEL_NN_POOLING_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xE,
-
+
VX_KERNEL_NN_TENSOR_REDUCE_SUM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xF,
-
+
VX_KERNEL_NN_TENSOR_PAD = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x10,
VX_KERNEL_NN_LSTM_UNIT = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x11,
@@ -439,25 +438,25 @@ enum vx_kernel_e {
VX_KERNEL_NN_LSTM_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x12,
VX_KERNEL_NN_REORG2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x13,
-
+
VX_KERNEL_NN_TENSOR_ROUNDING = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x14,
-
+
VX_KERNEL_NN_HASH_LUT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x15,
-
+
VX_KERNEL_NN_LSH_PROJECTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x16,
-
+
VX_KERNEL_NN_TENSOR_RESHPE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x17,
-
+
VX_KERNEL_NN_LUT2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x18,
-
+
VX_KERNEL_NN_TENSOR_SCALE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x19,
-
+
VX_KERNEL_NN_RNN_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1A,
-
+
VX_KERNEL_NN_SOFTMAX2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1B,
-
+
VX_KERNEL_NN_SVDF_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1C,
-
+
VX_KERNEL_NN_NORMALIZATION_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1D,
VX_KERNEL_NN_TENSOR_REVERSE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1E,
@@ -477,11 +476,11 @@ enum vx_kernel_e {
VX_KERNEL_NN_PRELU = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x25,
VX_KERNEL_NN_GRU_UNIT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x26,
-
+
VX_KERNEL_NN_GRU_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x27,
-
+
VX_KERNEL_NN_CONV_LSTM_UNIT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x28,
-
+
VX_KERNEL_NN_CONV_LSTM_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x29,
VX_KERNEL_NN_FULLY_CONNECTED_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2A,
@@ -498,6 +497,8 @@ enum vx_kernel_e {
VX_KERNEL_NN_CONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x30,
+ VX_KERNEL_NN_DECONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x31,
+
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
index d6d9b93..74f3592 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
@@ -29,8 +29,8 @@
#define __VX_KHR_COMPATIBLE_H__
/*
VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS is used to distingush deconvolution weight layout
- [value]
- 0: weight_layout is whnc
+ [value]
+ 0: weight_layout is whnc
1: weight_layout is whcn
*/
#ifndef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
@@ -166,4 +166,34 @@ VX_CONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support con
#define VX_CONV_3D_API_SUPPORT 1
#endif
+/*
+VX_DECONV_3D_API_SUPPORT is used to declare that the vsi openvx driver can support deconv3d through the vxDeconv3dLayer API.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_DECONV_3D_API_SUPPORT
+#define VX_DECONV_3D_API_SUPPORT 0
+#endif
+
+/*
+ VX_PAD_CONST_SUPPORT is used to declare that openvx can support pad_const for tensorpad and convolution.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_PAD_CONST_SUPPORT
+#define VX_PAD_CONST_SUPPORT 1
+#endif
+
+/*
+ VX_TENSOR_STRIDE_X_BITS_SUPPORT is used to declare that openvx can support tensors whose stride in the x dimension is not an integer number of bytes.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_TENSOR_STRIDE_X_BITS_SUPPORT
+#define VX_TENSOR_STRIDE_X_BITS_SUPPORT 1
+#endif
+
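+/* Example (illustrative sketch): applications may branch on these feature macros at
+ * compile time. The graph, tensors and parameter structure named below are assumptions
+ * for illustration only.
+ *
+ *     #if VX_DECONV_3D_API_SUPPORT
+ *         node = vxDeconv3dLayer(graph, input, weights, biases,
+ *                                &deconv3d_params, sizeof(deconv3d_params), output);
+ *     #else
+ *         // Fall back to an equivalent decomposition when deconv3d is unavailable.
+ *     #endif
+ */
+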
#endif /* __VX_KHR_COMPATIBLE_H__ */
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
index 88a9967..cca4338 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright (c) 2012-2017 The Khronos Group Inc.
*
@@ -74,7 +74,7 @@ CONVOLUTIONAL_NETWORK structs and enums
/*! \brief The Neural Network Extension Library Set
* \ingroup group_cnn
*/
-#define VX_LIBRARY_KHR_NN_EXTENSION (0x1)
+#define VX_LIBRARY_KHR_NN_EXTENSION (0x1)
/*! \brief The list of Neural Network Extension Kernels.
* \ingroup group_cnn
@@ -212,7 +212,7 @@ enum vx_nn_activation_function_e
VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
};
-/*! \brief The Convolutional network type
+/*! \brief The Convolutional network type
* \ingroup group_cnn
*/
enum vx_nn_layer_type_e
@@ -337,6 +337,30 @@ typedef struct _vx_nn_convolution_3d_params_t
vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value, if 0, means convolution, elsewise(>=1), the convolution is depthwiseconvolution. */
}vx_nn_convolution_3d_params_t;
+typedef struct _vx_nn_deconvolution_3d_params_t
+{
+ vx_int32 padding_w_left; /*!< \brief Number of elements subtracted at left of the w dimension of the input. */
+ vx_int32 padding_w_right; /*!< \brief Number of elements subtracted at right of the w dimension of the input. */
+ vx_int32 padding_h_top; /*!< \brief Number of elements subtracted at top of the h dimension of the input. */
+ vx_int32 padding_h_bottom; /*!< \brief Number of elements subtracted at bottom of the h dimension of the input. */
+ vx_int32 padding_d_front; /*!< \brief Number of elements subtracted at front of the d dimension of the input. */
+ vx_int32 padding_d_rear; /*!< \brief Number of elements subtracted at end of the d dimension of the input. */
+
+    vx_int32 stride_w;    /*!< \brief Stride in the w direction; zeros are inserted between input elements for up-scaling. */
+    vx_int32 stride_h;    /*!< \brief Stride in the h direction; zeros are inserted between input elements for up-scaling. */
+    vx_int32 stride_d;    /*!< \brief Stride in the d direction; zeros are inserted between input elements for up-scaling. */
+
+ vx_int32 a_w; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_w\f$ different possible output sizes. */
+ vx_int32 a_h; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_h\f$ different possible output sizes. */
+ vx_int32 a_d; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_d\f$ different possible output sizes. */
+
+ vx_int32 channel_group; /*!< \brief Number of separate groups for deconvolution (Range: 0 <= groups <= size of z dimension of input; size of z dimension of input can be divided by groups) */
+
+ vx_enum overflow_policy; /*!< \brief A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. */
+ vx_enum rounding_policy; /*!< \brief A VX_TYPE_ENUM of the vx_round_policy_e enumeration. */
+ vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See \ref vx_nn_rounding_type_e */
+}vx_nn_deconvolution_3d_params_t;
+
/*==============================================================================
TENSOR DATA FUNCTIONS
=============================================================================*/
@@ -415,9 +439,9 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorAddressing(vx_tensor_addressin
/*! \brief Creates an array of tensors
* \param [in] context The reference to the overall Context.
* \param [in] count Number of Objects to create in the ObjectArray.
- * \param [in] tensor* The tensors array that need add to the ObjectArray.
+ * \param [in] tensor* The tensors array that need add to the ObjectArray.
*
- * \returns An ObjectArray reference \ref vx_object_array. Any possible errors preventing a
+ * \returns An ObjectArray reference \ref vx_object_array. Any possible errors preventing a
* successful creation should be checked using \ref vxGetStatus. Data objects are not initialized by this function.
*
* \ingroup group_object_array
@@ -426,18 +450,18 @@ VX_API_ENTRY vx_object_array VX_API_CALL vxCreateTensorObjectArray(vx_context co
typedef union _vx_tensor_quant_param
{
- struct
+ struct
{
vx_int8 fixed_point_pos; /*!< \brief Specifies the fixed point position when the input element type is int16/int8, if 0 calculations are performed in integer math */
} dfp;
- struct
+ struct
{
vx_float32 scale; /*!< \brief Scale vaule for the quantized value */
vx_int32 zeroPoint; /*!< \brief A 32 bit integer, in range [0, 255] */
} affine;
- struct
+ struct
{
vx_uint32 channelDim; /*!< \brief a 32 bit unsigned integer indicating channel dimension */
vx_uint32 scaleCount; /*!< \brief the size of the scale array, must be equal to size[channelDim] */
@@ -515,22 +539,22 @@ VX_API_ENTRY vx_status VX_API_CALL vxSwapTensor(vx_tensor tensor0, vx_tensor ten
* \param [in] context The reference to the implementation context.
* \param [in] tensor_create_params The \ref vx_tensor_create_params_t that points to a parameter structure.
* \param [in] size_of_create_params Size of parameter structure.
- * \param [in] addrs The tensor patch addressing structures that define the dimension and stride of pointers. See note below.
+ * \param [in] addrs The tensor patch addressing structures that define the dimension and stride of pointers. See note below.
* \param [in] ptr The logical pointer of platform-defined references to tensor data.
* \param [in] import_type \ref vx_memory_type_e. When giving \ref VX_MEMORY_TYPE_HOST
* the \a ptr is assumed to be a HOST accessible pointer to memory.
- * \returns An tensor reference \ref vx_tensor. Any possible errors preventing a
+ * \returns An tensor reference \ref vx_tensor. Any possible errors preventing a
* successful creation should be checked using \ref vxGetStatus.
*
* In order to release the image back to the application we should use \ref vxSwapTensorHandle.
- *
+ *
* \ingroup group_tensor
*\version 0.4
*/
VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2(
- vx_context context, const vx_tensor_create_params_t* tensor_create_params, vx_size size_of_create_params, const vx_tensor_addressing addrs,
+ vx_context context, const vx_tensor_create_params_t* tensor_create_params, vx_size size_of_create_params, const vx_tensor_addressing addrs,
void * const ptr, vx_enum import_type);
-
+
/*! \brief Flush the memory referenced by reference's handle when it is ready.
* \param [in] ref The reference(image or tensor) which created from handle.
* \return A \ref vx_status_e enumeration.;
@@ -607,7 +631,7 @@ typedef struct _vx_nn_convolution_params_t
typedef struct _vx_nn_convolution_params_ext_t
{
vx_nn_convolution_params_t khr; /*!< \brief Khronos standard structure head */
- vx_size padding_x_right; /*!< \brief Number of elements added at each side in the right of x dimension of the input,
+ vx_size padding_x_right; /*!< \brief Number of elements added at each side in the right of x dimension of the input,
"padding_x" is for the left */
vx_size padding_y_bottom; /*!< \brief Number of elements added at each side in the bottom of y dimension of the input.
"padding_y" is for the top */
@@ -696,7 +720,7 @@ typedef struct _vx_nn_convolution_params_ext2_t
* The relation between input to output is as follows: \n
* \f$ width_{output} = round(\frac{(width_{input} + 2 * padding_x - kernel_x - (kernel_x -1) * dilation_x)}{skip_x} + 1) \f$\n
* and \n
- * \f$ height_{output} = round(\frac{(height + 2 * padding_y - kernel_y - (kernel_y -1) * dilation_y)}{skip_y} + 1) \f$\n
+ * \f$ height_{output} = round(\frac{(height + 2 * padding_y - kernel_y - (kernel_y -1) * dilation_y)}{skip_y} + 1) \f$\n
* where \f$width\f$ is the size of the input width dimension. \f$height\f$ is the size of the input height dimension.
* \f$width_{output}\f$ is the size of the output width dimension. \f$height_{output}\f$ is the size of the output height dimension.
* \f$kernel_x\f$ and \f$kernel_y\f$ are the convolution sizes in width and height dimensions.
@@ -705,11 +729,11 @@ typedef struct _vx_nn_convolution_params_ext2_t
* Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here.
* \param [in] graph The handle to the graph.
* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested.
- * The dimension order is [width, height, #IFM, #batches].\n
+ * The dimension order is [width, height, #IFM, #batches].\n
* \param [in] weights [*static] Weights are 4d tensor with dimensions [kernel_x, kernel_y, #IFM, #OFM]. see \ref vxCreateTensor2 and \ref vxCreateVirtualTensor2 \n Weights data type must match the data type of the inputs. (Kernel parameter #1)
* \param [in] biases [*static] Optional, ignored if NULL. The biases, which may be shared (one per ofm) or unshared (one per ofm * output location). The possible layouts are
- * either [#OFM] or [width, height, #OFM]. Biases data type must match the data type of the inputs.
- * \param [in] convolution_params [static] Pointer to parameters of type \ref vx_nn_convolution_params_t.
+ * either [#OFM] or [width, height, #OFM]. Biases data type must match the data type of the inputs.
+ * \param [in] convolution_params [static] Pointer to parameters of type \ref vx_nn_convolution_params_t.
* \param [in] size_of_convolution_params [static] Size in bytes of convolution_params. Note that this parameter is not counted as one of the kernel parameters.
* \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. Output tensor data type must be same as the inputs.
* \return vx_node.
@@ -725,8 +749,8 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvolutionLayer(vx_graph graph, vx_tensor in
* round: rounding according the vx_round_policy_e enumeration. \n
* saturate: A saturation according the vx_convert_policy_e enumeration.
* The saturation is done based on the accumulator_bits parameter.
-* According the accumulator_bits, the saturation might not be performed every operation.
-* But every a specified amount of operations,
+* According the accumulator_bits, the saturation might not be performed every operation.
+* But every a specified amount of operations,
* that are suspected to saturate the accumulation bits\n
* The equation for Fully connected layer:\n
* \f$ outputs[i] = ( \sum_{j} saturate(round(inputs[j] \times weights[j,i])))+biasses[i] \f$\n
@@ -735,10 +759,10 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvolutionLayer(vx_graph graph, vx_tensor in
* Then down scale is done by picking the results according to a skip jump. The skip is determined by the output size dimensions.
* The relation between input to output is as follows:
* \f$ size_{output} = round(\frac{(size_{input} + 2 * pad)}{skip} + 1) \f$\n
-* where \f$size_{input}\f$ is the size of the input dimension.
-* \f$size_{output}\f$ is the size of the output dimension.
+* where \f$size_{input}\f$ is the size of the input dimension.
+* \f$size_{output}\f$ is the size of the output dimension.
* skip is calculated by the relation between input and output.
-* rounding is done according to \ref vx_convolutional_network_rounding_type_e.
+* rounding is done according to \ref vx_convolutional_network_rounding_type_e.
* \param [in] graph The handle to the graph.
* \param [in] inputs The input tensor data. There two possible input layouts:
* 1. [#IFM, #batches]. See \ref vxCreateTensor2 and \ref vxCreateVirtualTensor2.
@@ -884,7 +908,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxNormalizationLayer2(vx_graph graph, vx_tensor
* \returns A node reference \ref vx_node. Any possible errors preventing a
* successful creation should be checked using \ref vxGetStatus.
*/
-VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inputs, vx_enum function, vx_float32 a,vx_float32 b, vx_tensor outputs);
+VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inputs, vx_enum function, vx_float32 a,vx_float32 b, vx_tensor outputs);
/*! \brief [Graph] Creates a Convolutional Network ROI pooling node
* \details Pooling is done on the width and height dimensions of the \ref vx_tensor. The ROI Pooling get an array of roi rectangles, and an input tensor.
@@ -892,9 +916,9 @@ VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inp
* The down scale method is determined by the pool_type.
* Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here.
* \param [in] graph The handle to the graph.
- * \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional. Dimension layout is [width, height, #IFM, #batches].
+ * \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional. Dimension layout is [width, height, #IFM, #batches].
* See \ref vxCreateTensor2 and \ref vxCreateVirtualTensor2.
- * Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8' or 'KHR_NN_8 KHR_NN_16'. (Kernel parameter #0)
+ * Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8' or 'KHR_NN_8 KHR_NN_16'. (Kernel parameter #0)
* \param [in] inputs_rois The roi array tensor. ROI array with dimensions [4, roi_count, #batches] where the first dimension represents 4 coordinates of the top left and bottom right corners of the roi rectangles, based on the input tensor width and height.
* #batches is optional and must be the same as in inputs. roi_count is the number of ROI rectangles. (Kernel parameter #1)
* \param [in] pool_type [static] Of type \ref vx_nn_pooling_type_e. Only \ref VX_NN_POOLING_MAX pooling is supported. (Kernel parameter #2)
@@ -906,13 +930,13 @@ VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inp
* successful creation should be checked using \ref vxGetStatus.
*/
VX_API_ENTRY vx_node VX_API_CALL vxROIPoolingLayer(vx_graph graph, vx_tensor input_data, vx_tensor input_rois, const vx_nn_roi_pool_params_t *roi_pool_params, vx_size size_of_roi_params, vx_tensor output_arr);
-
-
+
+
/*! \brief [Graph] Creates a Convolutional Network Deconvolution Layer Node.
* \details Deconvolution denote a sort of reverse convolution, which importantly and confusingly is not actually a proper mathematical deconvolution.
* Convolutional Network Deconvolution is up-sampling of an image by learned Deconvolution coefficients.
* The operation is similar to convolution but can be implemented by up-sampling the inputs with zeros insertions between the inputs,
- * and convolving the Deconvolution kernels on the up-sampled result.
+ * and convolving the Deconvolution kernels on the up-sampled result.
* For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined,
* and should be at least 16.\n
* round: rounding according the vx_round_policy_e enumeration. \n
@@ -926,7 +950,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxROIPoolingLayer(vx_graph graph, vx_tensor inp
* The relation between input to output is as follows: \n
* \f$ width_{output} = (width_{input} -1) * upscale_x - 2 * padding_x + kernel_x + a_x \f$\n
* and \n
- * \f$ height_{output} = (height_{input} - 1) * upscale_y - 2 * padding_y + kernel_y + a_y \f$\n
+ * \f$ height_{output} = (height_{input} - 1) * upscale_y - 2 * padding_y + kernel_y + a_y \f$\n
* where \f$width_{input}\f$ is the size of the input width dimension. \f$height_{input}\f$ is the size of the input height dimension.
* \f$width_{output}\f$ is the size of the output width dimension. \f$height_{output}\f$ is the size of the output height dimension.
* \f$kernel_x\f$ and \f$kernel_y\f$ are the convolution sizes in width and height. \f$a_x\f$ and \f$a_y\f$ are user-specified quantity used to distinguish between the \f$upscale_x\f$ and \f$upscale_y\f$ different possible output sizes.
@@ -966,7 +990,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxDeconvolutionLayer(vx_graph graph, vx_tensor
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxLeakyReluLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor inputs,
vx_float32 negative_slope,
vx_tensor outputs
@@ -985,7 +1009,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLeakyReluLayer(
* \version 0.5
*/
VX_API_ENTRY vx_node VX_API_CALL vxPReluLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor inputs,
vx_tensor alpha,
vx_tensor outputs
@@ -1033,14 +1057,14 @@ VX_API_ENTRY vx_node VX_API_CALL vxConcat2Layer(
vx_tensor in0,
vx_tensor in1,
vx_tensor out
- );
+ );
/*! \brief parameter for vxConcatIndefiniteLayer
* \ingroup group_cnn
* \version 0.4
*/
typedef struct _vx_nn_concat_params_t
-{
+{
vx_uint32 axis; /*!< \brief The axis on which we need do concat. */
} vx_nn_concat_params_t;
@@ -1085,7 +1109,7 @@ enum vx_reorg_type_e
VX_REORG_SHUFFLE_CHANNEL,
};
-/*! \brief Input parameter for reorg layer
+/*! \brief Input parameter for reorg layer
*\ingroup group_cnn
*\version 0.4
*/
@@ -1108,7 +1132,7 @@ typedef struct _vx_nn_reorg_params_ext_t
typedef struct _vx_nn_reorg_params_ext2_t
{
vx_nn_reorg_params_t base; /*!< \brief vx_nn_reorg_params \ref vx_nn_reorg_params_t */
- vx_int32 *num_group;
+ vx_int32 *num_group;
vx_int32 *axis;
} vx_nn_reorg_params_ext2_t;
@@ -1125,7 +1149,7 @@ typedef struct _vx_nn_reorg_params_ext2_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxReorgLayer2(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_reorg_params reorg_params,
vx_size size_of_reorg_params,
@@ -1154,7 +1178,7 @@ typedef struct _vx_nn_rounding_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorRoundingNode(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_rounding_params rounding_params,
vx_size size_of_rounding_params,
@@ -1189,7 +1213,7 @@ typedef struct _vx_nn_hashlut_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxHashTableLookupLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_hashlut_params hashlut_params,
vx_size size_of_hashlut_params,
@@ -1235,7 +1259,7 @@ typedef struct _vx_nn_lshproj_params_t
* \param [in] lshproj_params Pointer to parameters of type \ref vx_nn_lshproj_params
* \param [in] size_of_lshproj_params [static] Size in bytes of vx_nn_lshproj_params.
* \param [out] output The output tensor data.
- * If the projection type is sparse:
+ * If the projection type is sparse:
* Output.Dim == { Tensor[0].Dim[0] }
* A tensor that represents hash signatures.
* If the projection type is Dense:
@@ -1248,7 +1272,7 @@ typedef struct _vx_nn_lshproj_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxLSHProjectionLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_lshproj_params lshproj_params,
vx_size size_of_lshproj_params,
@@ -1261,7 +1285,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLSHProjectionLayer(
*/
typedef struct _vx_nn_reshape_params_t
{
- vx_tensor dims; /*!< \brief dimension. */
+ vx_tensor dims; /*!< \brief dimension. */
} vx_nn_reshape_params_t, * vx_nn_reshape_params;
/*! \brief [Graph] Creates a Reshape Layer Node.
@@ -1277,7 +1301,7 @@ typedef struct _vx_nn_reshape_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorReshapeNode(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_reshape_params reshape_params,
vx_size size_of_reshape_params,
@@ -1306,7 +1330,7 @@ typedef struct _vx_nn_scale_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorScaleNode(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_scale_params scale_params,
vx_size size_of_scale_params,
@@ -1370,7 +1394,7 @@ typedef struct _vx_nn_rnn_params_t
* \details A basic recurrent neural network layer.
* This layer implements the operation:
* outputs = state = activation(inputs * input_weights + state * recurrent_weights + bias)
- *
+ *
* Where:
* "input_weights" is a weight matrix that multiplies the inputs;
* "recurrent_weights" is a weight matrix that multiplies the current
@@ -1392,7 +1416,7 @@ typedef struct _vx_nn_rnn_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxRNNLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_rnn_params rnn_params,
vx_size size_of_rnn_params,
@@ -1432,7 +1456,7 @@ typedef struct _vx_nn_softmax_params_ext_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxSoftmaxLayer2(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_softmax_params softmax_params,
vx_size size_of_softmax_params,
@@ -1458,25 +1482,25 @@ typedef struct _vx_nn_svdf_params_t
* densely connected layer that's processing a sequence of input frames can
* be approximated by using a singular value decomposition of each of its
* nodes. The implementation is based on:
- *
+ *
* https://research.google.com/pubs/archive/43813.pdf
- *
+ *
* P. Nakkiran, R. Alvarez, R. Prabhavalkar, C. Parada.
* "Compressing Deep Neural Networks using a Rank-Constrained Topology".
* INTERSPEECH, 2015.
- *
+ *
* It processes the incoming input using a 2-stage filtering mechanism:
* stage 1 performs filtering on the "features" dimension, whose outputs get
* pushed into a memory of fixed-size memory_size.
* stage 2 performs filtering on the "time" dimension of the memory_size
* memoized outputs of stage 1.
- *
+ *
* Specifically, for rank 1, this layer implements the operation:
- *
+ *
* memory = push(conv1d(inputs, weights_feature, feature_dim,
* "PADDING_VALID"));
* outputs = activation(memory * weights_time + bias);
- *
+ *
* Where:
* "weights_feature" is a weights matrix that processes the inputs (by
* convolving the input with every "feature filter"), and whose outputs get
@@ -1488,7 +1512,7 @@ typedef struct _vx_nn_svdf_params_t
* batch); and
* "activation" is the function passed as the "fused_activation_function"
* argument (if not "NONE").
- *
+ *
* Each rank adds a dimension to the weights matrices by means of stacking
* the filters.
* \param [in] graph The reference to the parent graph.
@@ -1506,7 +1530,7 @@ typedef struct _vx_nn_svdf_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxSVDFLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
const vx_nn_svdf_params svdf_params,
vx_size size_of_svdf_params,
@@ -1535,7 +1559,7 @@ typedef struct _vx_nn_pooling_params_t
* \version 0.4
*/
typedef struct _vx_nn_pooling_params_ext_t
-{
+{
vx_nn_pooling_params_t base; /*!< \brief The base definition.\ref vx_nn_pooling_params_t */
vx_uint32 stride_x; /*!< \brief Skip x jump for down scale. */
vx_uint32 stride_y; /*!< \brief Skip y jump for down scale. */
@@ -1569,7 +1593,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxPoolingLayer2(
/*! \brief [Graph] Performs arithmetic addition on element values in the input tensor data's.
* \param [in] graph The handle to the graph.
- * \param [in] in1 input tensor data,.
+ * \param [in] in1 input tensor data,.
* \param [in] in2 input tensor data, inputs must be of equal in dimensions.
* else, If in one of the vx_mddata dimension is 1.
* That dimension is considered as a const on all the dimension terms.
@@ -1639,7 +1663,6 @@ typedef struct _vx_nn_pad_params_t
vx_uint8 numViewDimensions; /*!< \brief The size of two arrays. */
vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the \ref vx_pad_mode_e enumeration. */
vx_scalar pad_const; /*!< \brief The order const value if setting pad mode to const, the const value is base value, not quantized value. */
-
} vx_nn_pad_params_t, * vx_nn_pad_params;
@@ -1716,9 +1739,9 @@ typedef struct _vx_nn_l2norm_params_t
* \retval * Node handle.
*/
VX_API_ENTRY vx_node VX_API_CALL vxL2NormalizeLayer2(
- vx_graph graph,
- vx_tensor inputs,
- const vx_nn_l2norm_params_t * l2norm_params,
+ vx_graph graph,
+ vx_tensor inputs,
+ const vx_nn_l2norm_params_t * l2norm_params,
vx_size size_of_l2norm_params,
vx_tensor outputs);
@@ -1752,7 +1775,7 @@ typedef struct _vx_nn_rpn_params_t
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxRPNLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor score,
vx_tensor bbox,
vx_tensor anchors,
@@ -1773,24 +1796,24 @@ typedef struct _vx_nn_lstm_params_t
vx_tensor input2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/
vx_tensor input2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/
vx_tensor input2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/
-
+
vx_tensor recurrent2input_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [num_units, output_size]. where "output_size" corresponds to either the number of cell units (i.e., "num_units"), or the second dimension of the "projection_weights", if defined.*/
vx_tensor recurrent2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/
vx_tensor recurrent2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/
vx_tensor recurrent2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/
-
+
vx_tensor cell2input_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
vx_tensor cell2forget_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
vx_tensor cell2output_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
-
+
vx_tensor input_gate_bias; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
vx_tensor forget_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/
vx_tensor cell_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/
vx_tensor output_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/
-
+
vx_tensor projection_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [output_size, num_units].*/
vx_tensor projection_bias; /*!< \brief Optional A 1-D tensor of type T, of shape [output_size].*/
-
+
vx_tensor activation; /*!< \brief Optional. An ActivationFunctionType indicating the activation function. If "NONE" is specified then it results in a linear activation.If "NONE" is specified then it results in a linear activation.*/
vx_tensor cell_clip; /*!< \brief A clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip]. If set to 0.0 then clipping is disabled.*/
vx_tensor proj_clip; /*!< \brief A clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.*/
@@ -1805,9 +1828,9 @@ typedef struct _vx_nn_lstm_params_ext_t
vx_tensor forget_bias; /*!< \brief A bias(float 32) for the forget gate. If set to 0.0f(by default) then bias is ignored.*/
vx_float32 norm_gain; /*!< \brief Float32[static] The layer normalization gain initial value(default is 1.0f).*/
- vx_float32 norm_shift; /*!< \brief Float32[static] The layer normalization shift initial value(default is 0.0f).*/
+ vx_float32 norm_shift; /*!< \brief Float32[static] The layer normalization shift initial value(default is 0.0f).*/
- vx_tensor sequence_length; /*!< \brief Optional[static] Specifies the length of each sequence in inputs. An `int32` (tensor) size `[batch_size]`, values in `[0, time_len)` or None(by default).*/
+ vx_tensor sequence_length; /*!< \brief Optional[static] Specifies the length of each sequence in inputs. An `int32` (tensor) size `[batch_size]`, values in `[0, time_len)` or None(by default).*/
/*Since ANDROID NN API level 29 there are additional inputs to this op:*/
vx_tensor layernorm2input_weight; /*!< \brief [Optional] The input layer normalization weights. A 1 - D tensor of shape[num_units].Used to rescale normalized inputs to activation at input gate.*/
@@ -1846,11 +1869,11 @@ typedef struct _vx_nn_lstm_layer_params_ext_t
* Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory
* recurrent neural network architectures for large scale acoustic modeling."
* INTERSPEECH, 2014.
- *
+ *
* The coupling of input and forget gate (CIFG) is based on:
* http://arxiv.org/pdf/1503.04069.pdf
* Greff et al. "LSTM: A Search Space Odyssey"
- *
+ *
* The class has the following independently optional inputs:
* * If input gate (if CIFG): "input_to_forget_weights",
* "recurrent_to_input_weights", "cell_to_input_weights", "input_gate_bias".
@@ -1870,7 +1893,7 @@ typedef struct _vx_nn_lstm_layer_params_ext_t
* \param [out] scratch A 3-D tensor of type T, of shape [num_cell, 4, batch_size].
* \param [out] output_state_out A 2-D tensor of type T, of shape [output_size, batch_size].
* \param [out] cell_state_out A 2-D tensor of type T, of shape [num_units, batch_size].
- * \param [out] output A 2-D tensor of type T, of shape [output_size, batch_size].
+ * \param [out] output A 2-D tensor of type T, of shape [output_size, batch_size].
* This is effectively the same as the current "output_state" value.
* \return vx_node.
* \returns A node reference \ref vx_node. Any possible errors preventing a
@@ -1905,7 +1928,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLstmUnitLayer(
* is the batching dimension.
* \param [in] lstm_layer_params LSTM paraments \ref vx_nn_lstm_layer_params_t .
* \param [in] size_of_lstm_layer_params [static] The size of the lstm_layer_params.
- * \param [out] output A 2-D/3D tensor of type T, of shape [output_size, batch_size] or [output_size, batch_size, time].
+ * \param [out] output A 2-D/3D tensor of type T, of shape [output_size, batch_size] or [output_size, batch_size, time].
* This is effectively the same as the current "output_state" value.
* \return vx_node.
* \returns A node reference \ref vx_node. Any possible errors preventing a
@@ -1914,7 +1937,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLstmUnitLayer(
* \version 0.3
*/
VX_API_ENTRY vx_node VX_API_CALL vxLstmLayer(
- vx_graph graph,
+ vx_graph graph,
vx_tensor input,
vx_tensor static_input,
vx_tensor cont,
@@ -1975,7 +1998,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorMeanNode(
* \param [in] input A n-D tensor, specifying the tensor to be squeezed.
* \param [in] squeeze_params paraments \ref vx_nn_squeeze_params_t .
* \param [in] size_of_squeeze_param [static] The size of the vx_nn_squeeze_params_t.
-* \param [out] output A n-D tensor of the same type as input. Contains the same data as input,
+* \param [out] output A n-D tensor of the same type as input. Contains the same data as input,
* but has one or more dimensions of size 1 removed.
* \return vx_node.
* \returns A node reference \ref vx_node. Any possible errors preventing a
@@ -2072,6 +2095,65 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryHardwareCaps(
*/
VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_convolution_3d_params_t *convolution_params, vx_size size_of_convolution_params, vx_tensor outputs);
+/*! \brief [Graph] Creates a Convolutional Network Deconvolution3d Layer Node.
+ * \details Deconvolution denotes a sort of reverse convolution, which importantly and confusingly is not actually a proper mathematical deconvolution.
+ * Convolutional Network Deconvolution is up-sampling of an image by learned Deconvolution coefficients.
+ * The operation is similar to convolution but can be implemented by up-sampling the inputs with zeros insertions between the inputs,
+ * and convolving the Deconvolution kernels on the up-sampled result.
+ * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined,
+ * and should be at least 16.\n
+ * round: rounding according the vx_round_policy_e enumeration. \n
+ * saturate: A saturation according the vx_convert_policy_e enumeration.
+ * The following equation is implemented: \n
+ * \f$ outputs[j,k,i] = saturate(round(\sum_{l} \sum_{m,n}(inputs_{upscaled}[j+m,k+n,l] \times weights[m,n,l,i])+biasses[j,k,i])) \f$\n
+ * Where \f$m,n\f$ are indexes on the convolution matrices. \f$ l\f$ is an index on all the convolutions per input.\f$ i\f$ is an index per output.
+ * \f$ j,k \f$ are the inputs/outputs spatial indexes.
+ * Deconvolution is done on the width, height and depth dimensions of the \ref vx_tensor. Therefore, we use here the term x for the width dimension, y for the height dimension and d for the depth dimension.\n
+ * Before the Deconvolution is done, up-scaling of the width, height and depth dimensions with zeros is performed.
+ * The relation between input to output is as follows: \n
+ * \f$ width_{output} = (width_{input} -1) * upscale_x - 2 * padding_x + kernel_x + a_x \f$\n
+ * and \n
+ * \f$ height_{output} = (height_{input} - 1) * upscale_y - 2 * padding_y + kernel_y + a_y \f$\n
+ * \f$ depth_{output} = (depth_{input} - 1) * upscale_d - 2 * padding_d + kernel_d + a_d \f$\n
+ * where
+ * \f$width_{input}\f$ is the size of the input width dimension.
+ * \f$height_{input}\f$ is the size of the input height dimension.
+ * \f$depth_{input}\f$ is the size of the input depth dimension.
+ *
+ * \f$width_{output}\f$ is the size of the output width dimension.
+ * \f$height_{output}\f$ is the size of the output height dimension.
+ * \f$depth_{output}\f$ is the size of the output depth dimension.
+ *
+ * \f$kernel_x\f$, \f$kernel_y\f$ and \f$kernel_d\f$ are the deconvolution kernel sizes in width, height and depth.
+ * \f$a_x\f$, \f$a_y\f$ and \f$a_d\f$ are user-specified quantities used to distinguish between the \f$upscale_x\f$, \f$upscale_y\f$ and \f$upscale_d\f$ different possible output sizes.
+ * \f$upscale_x\f$, \f$upscale_y\f$ and \f$upscale_d\f$ are calculated by the relation between input and output.
+ * \f$a_x\f$, \f$a_y\f$ and \f$a_d\f$ must be positive and smaller than \f$upscale_x\f$, \f$upscale_y\f$ and \f$upscale_d\f$ respectively.
+ * Since the padding parameter applies to the output, the effective input padding is: \n
+ * \f$ padding_{input_x} = kernel_x -padding_x -1\f$ \n
+ * \f$ padding_{input_y} = kernel_y -padding_y -1\f$ \n
+ * \f$ padding_{input_d} = kernel_d -padding_d -1\f$ \n
+ * Therefore the following constraints apply:
+ * \f$kernel_x >= padding_x - 1\f$,
+ * \f$kernel_y >= padding_y - 1\f$,
+ * \f$kernel_d >= padding_d - 1\f$.
+ * rounding is done according to \ref vx_nn_rounding_type_e.
+ * Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here.
+ * \param [in] graph The handle to the graph.
+ * \param [in] inputs The input tensor. 4 lower dimensions represent a single input, and an optional 5th dimension for batch of inputs. Dimension layout is [width, height, depth, #IFM, #batches].
+ * See \ref vxCreateTensor2 and \ref vxCreateVirtualTensor2.
+ * Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8' or 'KHR_NN_8 KHR_NN_16'. (Kernel parameter #0)
+ * \param [in] weights [static] The 5d weights with dimensions [width, height, depth, #IFM, #OFM]. See \ref vxCreateTensor2 and \ref vxCreateVirtualTensor2. (Kernel parameter #1)
+ * \param [in] biases [static] Optional, ignored if NULL. The biases have one dimension [#OFM]. Implementations must support input tensor data type same as the inputs. (Kernel parameter #2)
+ * \param [in] deconvolution_params [static] Pointer to parameters of type \ref vx_nn_deconvolution_3d_params_t. (Kernel parameter #3)
+ * \param [in] size_of_deconv_params [static] Size in bytes of deconvolution_params. Note that this parameter is not counted as one of the kernel parameters.
+ * \param [out] outputs The output tensor. The output has the same number of dimensions as the input. (Kernel parameter #4)
+ * \ingroup group_cnn
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxDeconv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_deconvolution_3d_params_t *deconvolution_params, vx_size size_of_deconv_params, vx_tensor outputs);
+
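+/* Example (illustrative sketch): building a 3-D deconvolution node. The graph, the
+ * input/weights/biases/output tensors and the chosen strides, paddings and policies
+ * are assumptions for illustration only.
+ *
+ *     vx_nn_deconvolution_3d_params_t p;
+ *     memset(&p, 0, sizeof(p));            // a_w/a_h/a_d and channel_group left at 0
+ *     p.stride_w = p.stride_h = p.stride_d = 2;
+ *     p.padding_w_left = p.padding_w_right = 1;
+ *     p.padding_h_top = p.padding_h_bottom = 1;
+ *     p.padding_d_front = p.padding_d_rear = 1;
+ *     p.overflow_policy = VX_CONVERT_POLICY_SATURATE;
+ *     p.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
+ *     p.down_scale_size_rounding = VX_NN_DS_SIZE_ROUNDING_FLOOR;
+ *     vx_node node = vxDeconv3dLayer(graph, input, weights, biases,
+ *                                    &p, sizeof(p), output);
+ */
+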
#ifdef __cplusplus
}
#endif
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
index 51bf129..0dbdcc8 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
@@ -304,6 +304,39 @@ typedef struct _vx_tensor_view_t * vx_tensor_view;
*/
typedef struct _vx_tensor_addressing_t * vx_tensor_addressing;
+/*!
+ * \brief The addressing tensor patch structure is used by the Host only
+ * to address elements in a tensor view patch. The fields of the structure are defined as:
+ * \arg num_of_dims - The number of dimensions of the patch.
+ * \arg dim_sizes - Pointer to an array holding the size of the patch in each dimension.
+ * \arg strides - Pointer to an array holding the distance in bytes from one element to the
+ * next logically adjacent element, per dimension.
+ * \arg stride_x_bits - The stride in the x dimension in bits. Used when the stride in the
+ * x dimension is not an integer number of bytes.
+ * \see \ref vxCopyTensorPatch2
+ * \ingroup group_tensor
+ */
+typedef struct _vx_tensorpatch_addressing_t {
+    vx_uint32 num_of_dims;    /*!< \brief Number of dimensions in the patch. */
+ vx_size *dim_sizes; /*!< \brief Pointer to dimensions array */
+ vx_size *strides; /*!< \brief Pointer to strides array */
+ vx_uint16 stride_x_bits; /*!< \brief Stride in X dimension in bits. Used when stride_x is not an integer number of bytes. */
+} vx_tensorpatch_addressing_t;
+
+/*! \brief The addressing of a tensor patch structure is used by the Host only
+* to address elements in a tensor view patch.
+* \see \ref vxCopyTensorPatch2
+* \ingroup group_tensor
+*/
+typedef struct _vx_tensorpatch_addressing_t * vx_trensor_addressing;
+
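+/* Example (illustrative sketch): addressing a patch whose x stride is not a whole
+ * number of bytes, e.g. a \ref VX_TYPE_INT4 tensor; the dims/strides arrays are
+ * assumptions for illustration only.
+ *
+ *     vx_tensorpatch_addressing_t addr;
+ *     addr.num_of_dims   = 2;
+ *     addr.dim_sizes     = dims;      // patch size per dimension
+ *     addr.strides       = strides;   // byte strides for the higher dimensions
+ *     addr.stride_x_bits = 4;         // 4-bit elements: x stride expressed in bits
+ */
+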
/*! \brief The weight bias parameter for fused layers
* \ingroup group_cnn
*/
@@ -437,6 +470,8 @@ enum vx_type_e {
/* \todo add new object types here */
VX_TYPE_BFLOAT16 = 0x81A,/*!< \brief A \ref vx_bfloat16. */
+    VX_TYPE_INT4 = 0x81C,/*!< \brief A signed 4-bit tensor data type. */
+    VX_TYPE_UINT4 = 0x81D,/*!< \brief An unsigned 4-bit tensor data type. */
};
/*! \brief The enumeration of all status codes.
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h
index e31ba0d..f86014d 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h
@@ -53,17 +53,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxSysSetVipFrequency(
vx_uint32 shaderFscaleValue
);
-/*! \brief cancel all VIP processing jobs.
+/*! \brief cancel all VIP processing jobs on a device.
* \param [in] context The reference to the implementation context.
+ * \param [in] deviceID The ID of the device bound to the graph.
* \return A \ref vx_status_e enumeration.
- * \retval VX_SUCCESS Cancelled all VIP processing job successfully
+ * \retval VX_SUCCESS Cancelled all VIP processing jobs on the device successfully
* and user can check return of vxProcessGraph() to get cancelled status.
* \retval VX_ERROR_INVAID_PARAMETERS Invalid context reference.
* \retval VX_ERROR_NOT_SUPPORTED Hardware does not support job cancellation.
- * \retval VX_FAILURE Failed to cancel VIP proccessing job.
+ * \retval VX_FAILURE Failed to cancel VIP processing jobs on the device.
*/
VX_API_ENTRY vx_status VX_API_CALL vxSysCancelJob(
- vx_context context
+ vx_context context,
+ vx_uint32 deviceID
);
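+
+/* Example (illustrative sketch): cancelling outstanding VIP jobs on device 0; the
+ * device ID is an assumption and should match the device the graph is bound to.
+ *
+ *     vx_status status = vxSysCancelJob(context, 0);
+ */
+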
#ifdef __cplusplus
diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so
index 4831755..5f9565c 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so
index e9101a5..d278960 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so
index 2d30e1e..213d250 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so
index 690ba12..434ffc4 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so
index 6a2cefc..d88e0ce 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0
index 29fffa4..ebea7d4 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so
index e33fc05..ee7b8f8 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so
index 0d2a6c0..2339562 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so
index e8b7c99..bb370e9 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ
diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def
index 4765bd5..fa4dc17 100644
--- a/src/tim/vx/internal/include/interface/ops.def
+++ b/src/tim/vx/internal/include/interface/ops.def
@@ -168,3 +168,7 @@ DEF_OP(CONV3D)
DEF_OP(DECONV3D)
DEF_OP(PAD2)
DEF_OP(COS)
+DEF_OP(PRE_PROCESS_RGB888_PLANAR)
+DEF_OP(GATHER_ELEMENTS)
+DEF_OP(SELU)
+DEF_OP(CELU)
diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h
index f5da0f1..53c4969 100644
--- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h
+++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h
@@ -35,7 +35,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_MISH = 1,
VSI_NN_KERNEL_LUT_LOG = 2,
VSI_NN_KERNEL_LUT_EXP = 3,
- VSI_NN_KERNEL_LUT_ELU = 4,
+ VSI_NN_KERNEL_LUT_SELU = 4,
VSI_NN_KERNEL_LUT_NEG = 5,
VSI_NN_KERNEL_LUT_HSIGMOID = 6,
VSI_NN_KERNEL_LUT_SOFT_PLUS = 7,
@@ -45,6 +45,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_RELU_KERAS = 11,
VSI_NN_KERNEL_LUT_CLIP = 12,
VSI_NN_KERNEL_LUT_SQUARE = 13,
+ VSI_NN_KERNEL_LUT_CELU = 14,
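+
+    /* For reference, the common definitions of the activations added above (assumed,
+     * not taken from this header):
+     *   SELU(x) = scale * (x > 0 ? x : alpha * (exp(x) - 1)), alpha ~= 1.6733, scale ~= 1.0507
+     *   CELU(x) = max(0, x) + min(0, alpha * (exp(x / alpha) - 1))
+     */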
};
#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024)
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_celu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_celu.h
new file mode 100644
index 0000000..f38ac56
--- /dev/null
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_celu.h
@@ -0,0 +1,47 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_CELU_H
+#define _VSI_NN_OP_CELU_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_celu_param
+{
+ struct _celu_local_data_t* local;
+ // Add parameters here
+    float alpha;    // Alpha coefficient of the CELU activation
+} vsi_nn_celu_param;
+_compiler_assert(offsetof(vsi_nn_celu_param, local) == 0, \
+ vsi_nn_celu_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_clip.h b/src/tim/vx/internal/include/ops/vsi_nn_op_clip.h
index 4c4061d..b23970c 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_clip.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_clip.h
@@ -81,12 +81,7 @@ typedef struct _vsi_nn_clip_lcl_data
typedef struct _vsi_nn_clip_lcl2_data
{
- uint32_t hash_idx;
- vsi_bool execute_on_sw;
- vsi_bool enable_image_2d;
- uint32_t sizes0[VSI_NN_MAX_DIM_NUM];
- uint32_t sizes1[VSI_NN_MAX_DIM_NUM];
- uint32_t dim_num;
+ vsi_bool is_internal_node;
} vsi_nn_clip_lcl2_data;
typedef struct _vsi_nn_clip_param
@@ -103,4 +98,3 @@ typedef struct _vsi_nn_clip_param
#endif
#endif
-
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gather_elements.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gather_elements.h
new file mode 100644
index 0000000..535d913
--- /dev/null
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gather_elements.h
@@ -0,0 +1,48 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_GATHER_ELEMENTS_H
+#define _VSI_NN_OP_GATHER_ELEMENTS_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_gather_elements_param
+{
+ struct _gather_elements_local_data_t* local;
+ // Add parameters here
+ int32_t axis;
+} vsi_nn_gather_elements_param;
+_compiler_assert(offsetof(vsi_nn_gather_elements_param, local) == 0, \
+ vsi_nn_gather_elements_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h
new file mode 100644
index 0000000..f384e4f
--- /dev/null
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h
@@ -0,0 +1,64 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR_H
+#define _VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_pre_process_rgb888_planar_param
+{
+ struct _pre_process_rgb888_planar_local_data_t* local;
+ // Add parameters here
+ struct
+ {
+ uint32_t left;
+ uint32_t top;
+ uint32_t width;
+ uint32_t height;
+ } rect;
+
+ struct
+ {
+ vsi_size_t *size;
+ uint32_t dim_num;
+ } output_attr;
+
+ float r_mean;
+ float g_mean;
+ float b_mean;
+ float scale;
+} vsi_nn_pre_process_rgb888_planar_param;
+_compiler_assert(offsetof(vsi_nn_pre_process_rgb888_planar_param, local) == 0, \
+ vsi_nn_pre_process_rgb888_planar_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
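
Note: the r/g/b mean and scale fields follow the usual ovxlib pre-process convention of per-channel normalization after cropping, out = (in - mean) * scale. A per-pixel sketch under that assumption (illustrative only; the real kernels also handle plane layout and output quantization):

#include <stdint.h>

/* Illustrative per-channel normalization implied by the mean/scale fields. */
static float pre_process_px(uint8_t in, float mean, float scale)
{
    return ((float)in - mean) * scale;
}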
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_selu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_selu.h
new file mode 100644
index 0000000..ffbfc58
--- /dev/null
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_selu.h
@@ -0,0 +1,48 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_SELU_H
+#define _VSI_NN_OP_SELU_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_selu_param
+{
+ struct _selu_local_data_t* local;
+ // Add parameters here
+ float alpha;
+ float gamma;
+} vsi_nn_selu_param;
+_compiler_assert(offsetof(vsi_nn_selu_param, local) == 0, \
+ vsi_nn_selu_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
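
Note: alpha and gamma above parameterize the conventional SELU activation, selu(x) = gamma * x for x > 0 and gamma * alpha * (exp(x) - 1) otherwise. A reference sketch with an illustrative helper name:

#include <math.h>

static float selu_ref(float x, float alpha, float gamma)
{
    return x > 0.0f ? gamma * x : gamma * alpha * (expf(x) - 1.0f);
}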
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h b/src/tim/vx/internal/include/utils/vsi_nn_math.h
index 14fc5b5..18ea5e8 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_math.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h
@@ -56,6 +56,7 @@ extern "C" {
static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \
sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
+ if (array == NULL) return NULL; \
array->size = size; \
return array; \
} \
@@ -205,6 +206,14 @@ static inline double vsi_rint
return inter;
} /* vsi_rint() */
+/**
+* Computes an approximation of the error function.
+* This is the same approximation used by Eigen.
+*
+* @param[in] x The input value.
+*/
+float vsi_nn_erf_impl(float x);
+
#ifdef __cplusplus
}
#endif
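
Note: vsi_nn_erf_impl mirrors Eigen's polynomial fit of erf. As a rough illustration of such an approximation (this is the Abramowitz-Stegun 7.1.26 formula, not the Eigen polynomial the library actually uses):

#include <math.h>

/* Abramowitz & Stegun 7.1.26, absolute error ~1.5e-7; illustration only. */
static float erf_approx(float x)
{
    const float a1 = 0.254829592f, a2 = -0.284496736f, a3 = 1.421413741f;
    const float a4 = -1.453152027f, a5 = 1.061405429f, p = 0.3275911f;
    float sign = x < 0.0f ? -1.0f : 1.0f;
    float ax   = fabsf(x);
    float t    = 1.0f / (1.0f + p * ax);
    float y    = 1.0f - (((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t) * expf(-ax * ax);
    return sign * y;
}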
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h
index 8687247..9fb03d9 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_util.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h
@@ -58,6 +58,41 @@ extern "C" {
#define BITS_PER_BYTE 8
+#define VSI_NN_STRINGIZE(X) VSI_NN_DO_STRINGIZE(X)
+#define VSI_NN_DO_STRINGIZE(X) #X
+
+#define VSI_NN_JOIN(X, Y) VSI_NN_DO_JOIN(X, Y)
+#define VSI_NN_DO_JOIN(X, Y) VSI_NN_DO_JOIN2(X,Y)
+#define VSI_NN_DO_JOIN2(X, Y) X##Y
+
+#if defined(_MSC_VER)
+ #define VSI_NN_DEPRECATED(symbol, hints) \
+ __declspec(deprecated(VSI_NN_STRINGIZE(hints))) symbol
+
+ #define VSI_NN_SUPPRESS_DEPRECATED_BEGIN \
+ __pragma(warning( push )) \
+ __pragma(warning(disable : 4996))
+ #define VSI_NN_SUPPRESS_DEPRECATED_END \
+ __pragma(warning(pop))
+
+#elif defined(__GNUC__)
+ #define VSI_NN_DEPRECATED(symbol, hints) \
+ symbol __attribute__((deprecated(VSI_NN_STRINGIZE(hints))))
+
+ #define VSI_NN_SUPPRESS_DEPRECATED_BEGIN \
+ _Pragma("GCC diagnostic push") \
+ _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+
+ #define VSI_NN_SUPPRESS_DEPRECATED_END \
+ _Pragma("GCC diagnostic pop")
+#else
+ #define VSI_NN_DEPRECATED(symbol, hints) \
+ symbol
+
+ #define VSI_NN_SUPPRESS_DEPRECATED_BEGIN
+ #define VSI_NN_SUPPRESS_DEPRECATED_END
+#endif
+
/*-------------------------------------------
Functions
-------------------------------------------*/
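
Note: the deprecation helpers above are used further down in this patch to mark the legacy reshape union member. The intended usage pattern looks like the following (the field access is hypothetical and shown only to demonstrate the suppress guards):

/* Declaring a deprecated member, as done for vsi_nn_reshape_param below: */
vsi_nn_reshape_param VSI_NN_DEPRECATED(reshape, "Replace with reshape2");

/* Touching it without tripping -Wdeprecated-declarations / MSVC C4996: */
VSI_NN_SUPPRESS_DEPRECATED_BEGIN
node->nn_param.reshape.dim_num = 0;   /* hypothetical access */
VSI_NN_SUPPRESS_DEPRECATED_END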
diff --git a/src/tim/vx/internal/include/vip/virtual_device.h b/src/tim/vx/internal/include/vip/virtual_device.h
new file mode 100644
index 0000000..7feeb59
--- /dev/null
+++ b/src/tim/vx/internal/include/vip/virtual_device.h
@@ -0,0 +1,56 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VIP_VIRTUAL_DEVICE_H
+#define _VIP_VIRTUAL_DEVICE_H
+
+#include <functional>
+#include <cstdint>
+
+struct _vsi_nn_graph;
+typedef struct _vsi_nn_graph vsi_nn_graph_t;
+
+namespace vip {
+
+class Device;
+using func_t = std::function;
+using data_t = const void*;
+
+class IDevice {
+ public:
+ IDevice(uint32_t id);
+ ~IDevice();
+ uint32_t Id() const;
+ bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data);
+ bool GraphRemove(const vsi_nn_graph_t* graph);
+ bool ThreadExit();
+ bool ThreadIdle();
+ void WaitThreadIdle();
+
+ protected:
+ Device* device_;
+};
+
+} // namespace vip
+
+#endif
\ No newline at end of file
diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h
index 395ee2e..16f74fa 100644
--- a/src/tim/vx/internal/include/vsi_nn_node_type.h
+++ b/src/tim/vx/internal/include/vsi_nn_node_type.h
@@ -27,6 +27,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_assert.h"
+#include "utils/vsi_nn_util.h"
#include "ops/vsi_nn_op_activations.h"
#include "ops/vsi_nn_op_batch_norm.h"
#include "ops/vsi_nn_op_multiply.h"
@@ -185,6 +186,10 @@
#include "ops/vsi_nn_op_deconv3d.h"
#include "ops/vsi_nn_op_reduce_mean_internal.h"
#include "ops/vsi_nn_op_pad2.h"
+#include "ops/vsi_nn_op_pre_process_rgb888_planar.h"
+#include "ops/vsi_nn_op_gather_elements.h"
+#include "ops/vsi_nn_op_selu.h"
+#include "ops/vsi_nn_op_celu.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
@@ -210,7 +215,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_multiply_param multiply;
vsi_nn_proposal_param proposal;
vsi_nn_deconv_param deconv;
- vsi_nn_reshape_param reshape;
+ vsi_nn_reshape_param VSI_NN_DEPRECATED(reshape, "Replace with reshape2");
vsi_nn_permute_param permute;
vsi_nn_upsample_param upsample;
vsi_nn_resize_param resize;
@@ -356,6 +361,10 @@ typedef union _vsi_nn_nn_param
vsi_nn_deconv3d_param deconv3d;
vsi_nn_reduce_mean_internal_param reduce_mean_internal;
vsi_nn_pad2_param pad2;
+ vsi_nn_pre_process_rgb888_planar_param pre_process_rgb888_planar;
+ vsi_nn_gather_elements_param gather_elements;
+ vsi_nn_selu_param selu;
+ vsi_nn_celu_param celu;
void* client_param;
/* custom node data struct define */
diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
index 386123c..124ac48 100644
--- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
+++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
@@ -84,6 +84,7 @@ typedef enum
VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR,
VSI_NN_SOURCE_FORMAT_IMAGE_YUV444,
VSI_NN_SOURCE_FORMAT_IMAGE_NV12,
+ VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP,
} vsi_nn_preprocess_source_format_e;
/**
@@ -235,6 +236,13 @@ OVXLIB_API vsi_status vsi_nn_AddGraphPostProcess
uint32_t count
);
+OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
+ (
+ vsi_nn_graph_t* graph,
+ vsi_nn_node_id_t* enable_nodes,
+ uint32_t enable_nodes_count
+ );
+
#ifdef __cplusplus
}
#endif
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h
index 328aa19..faab685 100644
--- a/src/tim/vx/internal/include/vsi_nn_version.h
+++ b/src/tim/vx/internal/include/vsi_nn_version.h
@@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
-#define VSI_NN_VERSION_PATCH 39
+#define VSI_NN_VERSION_PATCH 43
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
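
Note: with this bump the packed value is VSI_NN_VERSION = 1 * 10000 + 1 * 100 + 43 = 10143, so client code can gate on the ops added here at compile time:

#if VSI_NN_VERSION >= 10143
/* gather_elements / selu / celu / pre_process_rgb888_planar are available */
#endif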
diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c
index f40c56e..38defcc 100644
--- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c
@@ -45,7 +45,7 @@ __BEGIN_DECLS
#define CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d))
-#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
+#define PACK_KERNEL_MAP_3D( IN_DTYPE, OUT_DTYPE ) \
{ CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("cl.clip_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_CLIP_KERNEL_SOURCE(IN_DTYPE) }
@@ -64,16 +64,22 @@ typedef struct
static const _kernel_map_type _clip_kernel_map[] =
{
- PACK_KERNEL_MAP(F32, F32),
- PACK_KERNEL_MAP(F32, U8),
- PACK_KERNEL_MAP(U8, U8),
- PACK_KERNEL_MAP(U8, F32),
- PACK_KERNEL_MAP(BF16, BF16),
- PACK_KERNEL_MAP_2D(F32, F32),
- PACK_KERNEL_MAP_2D(F32, U8),
- PACK_KERNEL_MAP_2D(U8, U8),
- PACK_KERNEL_MAP_2D(U8, F32),
- PACK_KERNEL_MAP_2D(BF16, BF16),
+ PACK_KERNEL_MAP_3D(F32, F32),
+ PACK_KERNEL_MAP_3D(F32, U8),
+ PACK_KERNEL_MAP_3D(F32, I32),
+ PACK_KERNEL_MAP_3D(U8, U8),
+ PACK_KERNEL_MAP_3D(U8, F32),
+ PACK_KERNEL_MAP_3D(I32, I32),
+ PACK_KERNEL_MAP_3D(I32, F32),
+ PACK_KERNEL_MAP_3D(BF16, BF16),
+ PACK_KERNEL_MAP_2D(F32, F32),
+ PACK_KERNEL_MAP_2D(F32, U8),
+ PACK_KERNEL_MAP_2D(F32, I32),
+ PACK_KERNEL_MAP_2D(U8, U8),
+ PACK_KERNEL_MAP_2D(U8, F32),
+ PACK_KERNEL_MAP_2D(I32, I32),
+ PACK_KERNEL_MAP_2D(I32, F32),
+ PACK_KERNEL_MAP_2D(BF16, BF16),
};
@@ -100,9 +106,6 @@ static vx_param_description_t _clip_kernel_param_def[] =
#define SCALAR_OUTPUT_SCALE (6)
#define SCALAR_OUTPUT_TAIL (7)
-#define CLIP_PARAM_NUM 4
-#define CLIP_QUANT_PARAM_NUM _cnt_of_array( _clip_kernel_param_def )
-
/*
* Kernel initializer
*/
@@ -122,7 +125,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
- vsi_size_array_t * out_shape = NULL;
+ vsi_size_array_t * out_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
@@ -149,8 +152,6 @@ final:
return status;
} /* _clip_initializer() */
-
-
/*
* Query kernel
*/
@@ -159,8 +160,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
- vsi_bool image_2d,
- vsi_bool *is_use_u8_kernel
+ vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
@@ -178,37 +178,47 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- if (F16 == in_dtype)
- {
- in_dtype = F32;
- }
+#define _PACK_SELECT_KEY( in_type, out_type ) \
+ ( ( in_type ) | ( out_type << 8 ))
- if (F16 == out_dtype)
+ switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
- out_dtype = F32;
+ case _PACK_SELECT_KEY(F32, F32):
+ case _PACK_SELECT_KEY(F16, F16):
+ key = CLIP_HASH_KEY( F32, F32, image_2d );
+ break;
+ case _PACK_SELECT_KEY(F32, I8):
+ case _PACK_SELECT_KEY(F16, I16):
+ case _PACK_SELECT_KEY(F16, I32):
+ key = CLIP_HASH_KEY( F32, I32, image_2d );
+ break;
+ case _PACK_SELECT_KEY(I8, I8):
+ case _PACK_SELECT_KEY(I16, I16):
+ case _PACK_SELECT_KEY(I32, I32):
+ key = CLIP_HASH_KEY( I32, I32, image_2d );
+ break;
+ case _PACK_SELECT_KEY(I8, F16):
+ case _PACK_SELECT_KEY(I16, F16):
+ case _PACK_SELECT_KEY(I32, F16):
+ case _PACK_SELECT_KEY(I8, F32):
+ case _PACK_SELECT_KEY(I16, F32):
+ case _PACK_SELECT_KEY(I32, F32):
+ key = CLIP_HASH_KEY( I32, F32, image_2d );
+ break;
+ default:
+ key = CLIP_HASH_KEY( in_dtype, out_dtype, image_2d );
+ break;
}
+#undef _PACK_SELECT_KEY
- if ((U8 == in_dtype) || (U8 == out_dtype))
- {
- param_def_size = CLIP_QUANT_PARAM_NUM;
- *is_use_u8_kernel = TRUE;
- }
- else
- {
- param_def_size = CLIP_PARAM_NUM;
- *is_use_u8_kernel = FALSE;
- }
-
- key = CLIP_HASH_KEY( in_dtype, out_dtype, image_2d );
-
- for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
{
break;
}
}
- if( i < (uint32_t)kernel_map_size )
+ if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@@ -246,7 +256,6 @@ static vsi_nn_kernel_node_t _setup
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
- vsi_bool is_use_u8_kernel = FALSE;
float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" );
float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" );
@@ -261,40 +270,31 @@ static vsi_nn_kernel_node_t _setup
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
- status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel);
+ status = _query_kernel( kernel, inputs, outputs, image_2d);
- if( VSI_SUCCESS == status)
+ if ( VSI_SUCCESS == status )
{
- size_t node_params_num = CLIP_PARAM_NUM;
-
node = vsi_nn_kernel_create_node( graph, kernel );
- if( node )
+ if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value );
node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value );
- if (is_use_u8_kernel)
- {
- node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
- node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
- node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
- node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
- node_params_num = CLIP_QUANT_PARAM_NUM;
- }
+ node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
+ node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
+ node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
+ node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
/* Pass parameters to node. */
- status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _CLIP_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MIN_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] );
- if (is_use_u8_kernel)
- {
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
- }
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
return node;
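
Note: the reworked _query_kernel folds data types onto the canonical kernels now registered in _clip_kernel_map (F16 pairs reuse the F32 kernels, I8/I16 reuse the I32 kernels), and the scale/zero-point scalars are passed unconditionally. The folding is roughly equivalent to this sketch, which is not the literal switch in the patch:

static uint32_t clip_fold_key(vsi_nn_kernel_dtype_e in_dtype,
    vsi_nn_kernel_dtype_e out_dtype, vsi_bool image_2d)
{
    if (F16 == in_dtype)  in_dtype  = F32;
    if (F16 == out_dtype) out_dtype = F32;
    if (I8 == in_dtype  || I16 == in_dtype)  in_dtype  = I32;
    if (I8 == out_dtype || I16 == out_dtype) out_dtype = I32;
    return CLIP_HASH_KEY( in_dtype, out_dtype, image_2d );
}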
diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c
index ef10ea5..7bf6d36 100644
--- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c
@@ -45,13 +45,14 @@ typedef enum
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
- UNARY_ELU,
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
UNARY_GELU,
- UNARY_HGELU
+ UNARY_HGELU,
+ UNARY_SELU,
+ UNARY_CELU,
} unary_type_e;
/*
@@ -60,16 +61,18 @@ typedef enum
#define HASH_UNARY_KEY(_type, _input_type, _output_type, _image_2d) \
((_type << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d))
- #define VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() \
- "eltwise_unary"
+#define _UNARY_KERNEL_SOURCE0_NAME() \
+ "eltwise_unary_0"
+#define _UNARY_KERNEL_SOURCE1_NAME() \
+ "eltwise_unary_1"
#define HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE)
-#define TENSOR_UNARY_KERNELS(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
+#define TENSOR_UNARY_KERNELS_3D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \
HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \
- VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
+ _UNARY_KERNEL_SOURCE1_NAME() },
#define HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE"_2D")
@@ -77,29 +80,20 @@ typedef enum
#define TENSOR_UNARY_KERNELS_2D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 1), \
HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \
- VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
-
-#define TENSOR_UNARY_KERNELS_FLOAT(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
- { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \
- HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, F32, F32), \
- VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
-
-#define TENSOR_UNARY_KERNELS_FLOAT_2D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
- { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 1), \
- HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, F32, F32), \
- VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
+ _UNARY_KERNEL_SOURCE0_NAME() },
#define SIN_OPERATION sin
#define COS_OPERATION cos
#define EXP_OPERATION exp
#define LOG_OPERATION log
-#define ELU_OPERATION elu
#define NEG_OPERATION neg
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define ROUND_OPERATION round
#define GELU_OPERATION gelu
#define HGELU_OPERATION hard_gelu
+#define SELU_OPERATION selu
+#define CELU_OPERATION celu
static const struct {
uint32_t key;
@@ -107,77 +101,59 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
- TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(ELU_OPERATION, UNARY_ELU, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(ELU_OPERATION, UNARY_ELU, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(NEG_OPERATION, UNARY_NEG, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(NEG_OPERATION, UNARY_NEG, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F16, F16)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F32, F32)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(ELU_OPERATION, UNARY_ELU, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(ELU_OPERATION, UNARY_ELU, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(NEG_OPERATION, UNARY_NEG, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(NEG_OPERATION, UNARY_NEG, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F16, F16)
- TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
- TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F32, F32)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F32, F32)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8)
- TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
};
@@ -186,13 +162,14 @@ static const struct {
#undef COS_OPERATION
#undef EXP_OPERATION
#undef LOG_OPERATION
-#undef ELU_OPERATION
#undef NEG_OPERATION
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
#undef GELU_OPERATION
#undef HGELU_OPERATION
+#undef SELU_OPERATION
+#undef CELU_OPERATION
/*
* Kernel params
*/
@@ -284,7 +261,21 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d );
+
+#define _PACK_SELECT_KEY( in_type, out_type ) \
+ ( ( in_type ) | ( out_type << 8 ))
+
+ switch (_PACK_SELECT_KEY(input_dtype, output_dtype))
+ {
+ case _PACK_SELECT_KEY(F32, F32):
+ case _PACK_SELECT_KEY(F16, F16):
+ key = HASH_UNARY_KEY( type, F32, F32, image_2d );
+ break;
+ default:
+ key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d );
+ break;
+ }
+#undef _PACK_SELECT_KEY
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
@@ -336,6 +327,15 @@ static vsi_nn_kernel_node_t _setup
float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
float beta = vsi_nn_kernel_param_get_float32( params, "beta" );
+ if (unary_type == UNARY_SELU)
+ {
+ alpha = alpha * beta;
+ }
+ else if (unary_type == UNARY_CELU)
+ {
+ beta = 1.0f / alpha;
+ }
+
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
@@ -450,11 +450,12 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG )
-REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
+REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
+REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
__END_DECLS
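
Note on the SELU/CELU scalar folding in _setup(): assuming beta carries the SELU gamma, passing alpha * beta lets the negative branch be computed as (alpha * gamma) * (exp(x) - 1); for CELU, passing beta = 1 / alpha lets the kernel evaluate exp(x / alpha) as exp(x * beta) without a divide. A host-side sketch of how the folded scalars can be consumed (illustrative helper names, conventional activation definitions assumed):

#include <math.h>

static float selu_from_folded(float x, float alpha_times_gamma, float gamma)
{
    /* alpha_times_gamma corresponds to the alpha = alpha * beta fold above */
    return x > 0.0f ? gamma * x : alpha_times_gamma * (expf(x) - 1.0f);
}

static float celu_from_folded(float x, float alpha, float inv_alpha)
{
    /* inv_alpha corresponds to the beta = 1.0f / alpha fold above */
    return fmaxf(0.0f, x) + fminf(0.0f, alpha * (expf(x * inv_alpha) - 1.0f));
}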
diff --git a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c
new file mode 100644
index 0000000..a8d56a2
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c
@@ -0,0 +1,282 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "libnnext/vx_lib_nnext.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+ INTERNAL_KERNEL_GATHER_ELEMENTS,
+} _internal_kernel_e;
+
+#define _GATHER_ELEMENTS_KERNEL_SOURCE "gather_elements"
+
+#define STR(a) #a
+// Add kernel hashtable here
+#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \
+ (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ))
+#define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
+ CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
+ _GATHER_ELEMENTS_KERNEL_SOURCE}
+
+#define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
+ CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
+ _GATHER_ELEMENTS_KERNEL_SOURCE}
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _gather_elements_kernel_map[] =
+{
+ // Register kernel here
+ PACK_KERNEL_3D_MAP( 0, F32, I32, F32 ),
+ PACK_KERNEL_3D_MAP( 0, I32, I32, I32 ),
+ PACK_KERNEL_3D_MAP( 0, U32, I32, U32 ),
+ PACK_KERNEL_3D_MAP( 1, F32, I32, F32 ),
+ PACK_KERNEL_3D_MAP( 1, I32, I32, I32 ),
+ PACK_KERNEL_3D_MAP( 1, U32, I32, U32 ),
+ PACK_KERNEL_3D_MAP( 2, F32, I32, F32 ),
+ PACK_KERNEL_3D_MAP( 2, I32, I32, I32 ),
+ PACK_KERNEL_3D_MAP( 2, U32, I32, U32 ),
+
+ PACK_KERNEL_2D_MAP( 0, F32, I32, F32 ),
+ PACK_KERNEL_2D_MAP( 0, I32, I32, I32 ),
+ PACK_KERNEL_2D_MAP( 0, U32, I32, U32 ),
+ PACK_KERNEL_2D_MAP( 1, F32, I32, F32 ),
+ PACK_KERNEL_2D_MAP( 1, I32, I32, I32 ),
+ PACK_KERNEL_2D_MAP( 1, U32, I32, U32 ),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _gather_elements_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _GATHER_ELEMENTS_PARAM_NUM _cnt_of_array( _gather_elements_kernel_param_def )
+#define SCALAR_INPUT_SCALE (3)
+#define SCALAR_INPUT_TAIL (4)
+#define SCALAR_INPUT_AXIS_SIZE (5)
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+ vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
+ vsi_size_array_t * out_shape = NULL;
+
+ output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
+
+ out_shape = output_attr->shape;
+
+ gpu_param.global_scale[0] = 1;
+ gpu_param.global_scale[1] = 1;
+ gpu_param.global_scale[2] = 1;
+
+ gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
+ gpu_param.global_size[0] = gpu_align_p2(
+ (out_shape->data[0] + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 4);
+ gpu_param.global_size[1] = (
+ (out_shape->data[1] + gpu_param.global_scale[1] - 1)
+ / gpu_param.global_scale[1]);
+ gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+
+final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
+ SAFE_FREE_TENSOR_ATTR(output_attr);
+ return status;
+} /* _gather_elements_initializer() */
+
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs,
+ int32_t axis
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in0_dtype;
+ vsi_nn_kernel_dtype_e in1_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _gather_elements_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _gather_elements_kernel_map );
+ vx_param_description_t * param_def = _gather_elements_kernel_param_def;
+ vx_kernel_initialize_f initializer = _gather_elements_initializer;
+ int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0;
+ uint32_t key = 0;
+ uint32_t i;
+
+ in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
+ out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+#define _PACK_SELECT_KEY( in0_type, out_type ) \
+ ( ( in0_type ) | ( out_type << 8 ))
+
+ switch (_PACK_SELECT_KEY(in0_dtype, out_dtype))
+ {
+ case _PACK_SELECT_KEY(F32, F32):
+ case _PACK_SELECT_KEY(F16, F16):
+ key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d );
+ break;
+ case _PACK_SELECT_KEY(U32, U32):
+ case _PACK_SELECT_KEY(U16, U16):
+ case _PACK_SELECT_KEY(U8, U8):
+ key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d );
+ break;
+ case _PACK_SELECT_KEY(I32, I32):
+ case _PACK_SELECT_KEY(I16, I16):
+ case _PACK_SELECT_KEY(I8, I8):
+ key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d );
+ break;
+ default:
+ break;
+ }
+#undef _PACK_SELECT_KEY
+
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if ( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def );
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_GATHER_ELEMENTS_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
+ float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+ float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
+ float input_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+ int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
+ int32_t axis_size = (int32_t)inputs[0]->attr.size[axis];
+
+ status = _query_kernel( kernel, inputs, outputs, axis );
+ if ( VSI_SUCCESS == status)
+ {
+ input_scale = input_scale / output_scale;
+ input_tail = output_zp - input_tail * input_scale;
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _GATHER_ELEMENTS_PARAM_NUM,
+ inputs, input_num, outputs, output_num );
+ /* Pass parameters to node. */
+ node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
+ node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail );
+ node_params[SCALAR_INPUT_AXIS_SIZE] = vsi_nn_kernel_scalar_create(graph, I32, &axis_size );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_ELEMENTS_PARAM_NUM );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS_SIZE] );
+ }
+ }
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CL( gather_elements, _setup )
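
Note: GatherElements follows the ONNX definition: the output takes the shape of the indices tensor, and along the selected axis each element reads input[..., index[...], ...]. A minimal 2D CPU reference for axis 0 (illustration only; the CL kernels above do the same per work-item, plus the scale/tail requantization):

#include <stdint.h>

/* out[y][x] = in[idx[y][x]][x]; in has in_rows rows, idx/out have out_rows rows. */
static void gather_elements_axis0_2d(const float *in, const int32_t *idx,
    float *out, int out_rows, int cols)
{
    int x, y;
    for (y = 0; y < out_rows; ++y)
    {
        for (x = 0; x < cols; ++x)
        {
            out[y * cols + x] = in[idx[y * cols + x] * cols + x];
        }
    }
}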
diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c
index e516df5..7b2f50a 100644
--- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c
@@ -59,9 +59,11 @@ typedef struct
static const _kernel_map_type _l2normalizescale_kernel_map[] =
{
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F32, F32, F32 )
- HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F32, U8 )
+ HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8, F32, U8 )
+ HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I32, F32, I32 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F32, F32, F32 )
- HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F32, U8 )
+ HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8, F32, U8 )
+ HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I32, F32, I32 )
};
@@ -91,9 +93,6 @@ static vx_param_description_t _l2normalizescale_kernel_param_def[] =
#define SCALAR_OUTPUT_SCALE (8)
#define SCALAR_OUTPUT_TAIL (9)
-#define L2NORMSCALE_PARAM_NUM 6
-#define L2NORMSCALE_QUANT_PARAM_NUM _cnt_of_array( _l2normalizescale_kernel_param_def )
-
/*
* Kernel initializer
*/
@@ -168,8 +167,7 @@ static vsi_status _query_kernel
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
- vsi_bool image_2d,
- vsi_bool *is_use_u8_kernel
+ vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
@@ -193,6 +191,10 @@ static vsi_status _query_kernel
{
in0_dtype = F32;
}
+ else if (I8 == in0_dtype || I16 == in0_dtype)
+ {
+ in0_dtype = I32;
+ }
if (F16 == in1_dtype)
{
@@ -203,16 +205,9 @@ static vsi_status _query_kernel
{
out_dtype = F32;
}
-
- if ((U8 == in0_dtype) || (U8 == out_dtype))
+ else if (I8 == out_dtype || I16 == out_dtype)
{
- param_def_size = L2NORMSCALE_QUANT_PARAM_NUM;
- *is_use_u8_kernel = TRUE;
- }
- else
- {
- param_def_size = L2NORMSCALE_PARAM_NUM;
- *is_use_u8_kernel = FALSE;
+ out_dtype = I32;
}
key = HASH_L2NORMALIZESCALE_HASH_KEY(axis, in0_dtype, in1_dtype, out_dtype, image_2d);
@@ -265,7 +260,6 @@ static vsi_nn_kernel_node_t _setup
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float epsilon = (float)10e-12;
float rsEps = 1.0f / sqrtf(epsilon);
- vsi_bool is_use_u8_kernel = FALSE;
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
@@ -282,7 +276,7 @@ static vsi_nn_kernel_node_t _setup
}
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
- status = _query_kernel( kernel, inputs, outputs, axis, image_2d, &is_use_u8_kernel );
+ status = _query_kernel( kernel, inputs, outputs, axis, image_2d );
axis_size = inputs[0]->attr.size[axis];
@@ -291,7 +285,6 @@ static vsi_nn_kernel_node_t _setup
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
- size_t node_params_num = L2NORMSCALE_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _L2NORMALIZESCALE_PARAM_NUM,
inputs, input_num, outputs, output_num );
@@ -301,27 +294,21 @@ static vsi_nn_kernel_node_t _setup
graph, I32, &axis_size );
node_params[SCALAR_EPS_VALUE] = vsi_nn_kernel_scalar_create(
graph, F32, &rsEps );
- if (is_use_u8_kernel)
- {
- node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
- node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
- node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
- node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
- node_params_num = L2NORMSCALE_QUANT_PARAM_NUM;
- }
+ node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
+ node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
+ node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
+ node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
+
/* Pass parameters to node. */
- status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _L2NORMALIZESCALE_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_AXIS_SIZE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_EPS_VALUE] );
- if (is_use_u8_kernel)
- {
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
- vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
- }
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
return node;
diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c
index 322bd22..c692265 100644
--- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
-#include "math.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
@@ -43,8 +42,6 @@ __BEGIN_DECLS
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "maximum",
-#define KERNEL_SOURCE_2 "maximum_fp16",
-#define KERNEL_SOURCE_3 "maximum_i16"
#define HASH_MAXIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
@@ -198,16 +195,25 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+#define CONVERT_I8_OR_I16TOI32(dtype) \
+ dtype = (dtype == I8 || dtype == I16) ? I32 : dtype
+
+ CONVERT_I8_OR_I16TOI32(input0_dtype);
+ CONVERT_I8_OR_I16TOI32(input1_dtype);
+ CONVERT_I8_OR_I16TOI32(output_dtype);
+#undef CONVERT_I8_OR_I16TOI32
+
key = HASH_MAXIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d );
- for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
- if( kernel_map[i].key == key )
+ if ( kernel_map[i].key == key )
{
break;
}
}
- if( i < _cnt_of_array(kernel_map) )
+ if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
@@ -248,7 +254,7 @@ static vsi_nn_kernel_node_t _setup
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
- if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
+ if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@@ -256,11 +262,11 @@ static vsi_nn_kernel_node_t _setup
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
- if( VSI_SUCCESS == status)
+ if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
- if( node )
+ if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c
index 40b9977..e5fe695 100644
--- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c
@@ -42,8 +42,6 @@ __BEGIN_DECLS
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "minimum",
-#define KERNEL_SOURCE_2 "minimum_fp16",
-#define KERNEL_SOURCE_3 "minimum_i16"
#define HASH_MINIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
@@ -197,16 +195,25 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+#define CONVERT_I8_OR_I16TOI32(dtype) \
+ dtype = (dtype == I8 || dtype == I16) ? I32 : dtype
+
+ CONVERT_I8_OR_I16TOI32(input0_dtype);
+ CONVERT_I8_OR_I16TOI32(input1_dtype);
+ CONVERT_I8_OR_I16TOI32(output_dtype);
+#undef CONVERT_I8_OR_I16TOI32
+
key = HASH_MINIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d );
- for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
- if( kernel_map[i].key == key )
+ if ( kernel_map[i].key == key )
{
break;
}
}
- if( i < _cnt_of_array(kernel_map) )
+ if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
@@ -247,7 +254,7 @@ static vsi_nn_kernel_node_t _setup
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
- if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
+ if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@@ -255,11 +262,11 @@ static vsi_nn_kernel_node_t _setup
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
- if( VSI_SUCCESS == status)
+ if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
- if( node )
+ if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c
index 33b575f..4369bea 100644
--- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c
@@ -176,6 +176,10 @@ static vsi_status _query_kernel
{
in_dtype = F32;
}
+ else if (in_dtype == I16 || in_dtype == I8)
+ {
+ in_dtype = I32;
+ }
if (out_dtype == F16)
{
diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c
index 47c896c..bed0f91 100644
--- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@@ -146,6 +145,7 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer)
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
+ SAFE_FREE_TENSOR_ATTR(rois_attr);
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
@@ -212,7 +212,6 @@ static vsi_status _query_kernel
}
return status;
-
} /* _query_kernel() */
#define _INPUT_NUM (3)
@@ -326,4 +325,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( roi_align, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c b/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c
index 293304d..f4b6eee 100644
--- a/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -76,8 +75,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
@@ -93,7 +92,6 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -243,4 +241,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( add_mean_std_norm, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c
index 52a5572..6bb8eeb 100644
--- a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c
@@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (1)
@@ -138,20 +137,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _argmax_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -159,7 +144,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _argmax_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -210,4 +199,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( argmax, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c
index 09aa235..3c9d6b9 100644
--- a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c
@@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -139,20 +138,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _argmin_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -160,7 +145,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _argmin_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -211,4 +200,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( argmin, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c
index 8903139..9d39e21 100644
--- a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c
@@ -108,8 +108,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -128,7 +128,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -276,4 +275,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c
index 0edbb7f..dcf7940 100644
--- a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c
@@ -34,7 +34,6 @@
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
@@ -160,20 +159,6 @@ static vx_param_description_t kernel_param_def[] =
#define SCALAR_INPUT_EPS (6)
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _batch_norm_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -181,7 +166,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _batch_norm_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -231,4 +220,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( batchnorm_single, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c b/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c
index 53af44c..28a5763 100644
--- a/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c
@@ -34,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -225,8 +224,8 @@ DEF_KERNEL_EXECUTOR(_compute)
int32_t* int32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
int32_t* int32_out_buffer[_OUTPUT_NUM] = {0};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
diff --git a/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c
index fdc462f..79cacfc 100644
--- a/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
@@ -72,8 +71,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -92,7 +91,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i]->asymm.zero_point = 0;
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -217,4 +215,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( cast, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c b/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c
index 70c40cb..5bb08de 100644
--- a/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -93,7 +92,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -212,10 +210,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
-
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( clip, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c
index 7e27725..a43f2f3 100644
--- a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -206,20 +205,6 @@ static vx_param_description_t kernel_param_def[] =
#define INPUT_FUNC_OP (3)
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _comparisons_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -227,7 +212,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _comparisons_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
diff --git a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c
index 4bdba10..dd820df 100644
--- a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c
@@ -34,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -182,7 +181,6 @@ final:
}
return status;
-
} /* _compute() */
/*
diff --git a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c
index 8217783..dea83c9 100644
--- a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -150,20 +149,6 @@ static vx_param_description_t _depth2space_crd_kernel_param_def[] =
};
#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _depth2space_crd_exec,
- _depth2space_crd_kernel_param_def,
- _cnt_of_array( _depth2space_crd_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -171,7 +156,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _depth2space_crd_exec;
+ kernel->info.parameters = _depth2space_crd_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _depth2space_crd_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -220,4 +209,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( depth2space_internal, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c
index 46de624..e6c787b 100644
--- a/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
#include "cpu_backend/npuref_interface.h"
__BEGIN_DECLS
@@ -272,4 +271,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( depthwise_conv1d, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c
index 8fd8da0..48de41c 100644
--- a/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -82,8 +81,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -102,7 +101,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for ( i = 0; i < _OUTPUT_NUM; i++ )
{
@@ -252,4 +250,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( detect_post_box, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c
index 4952873..3092350 100644
--- a/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c
@@ -22,7 +22,6 @@
*
*****************************************************************************/
-
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -35,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -199,8 +197,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -222,7 +220,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for ( i = 0; i < _OUTPUT_NUM; i++ )
{
@@ -524,4 +521,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( detect_post_nms, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c
index 17b7be6..7c6c480 100644
--- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -43,13 +42,14 @@ typedef enum
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
- UNARY_ELU,
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
UNARY_GELU,
UNARY_HGELU,
+ UNARY_SELU,
+ UNARY_CELU,
} unary_type_e;
@@ -80,11 +80,6 @@ static float log_eval(float data)
return logf(data);
}
-static float elu_eval(float data, float alpha)
-{
- return data >=0 ? data : expf(data) * alpha - alpha;
-}
-
static float neg_eval(float data)
{
return data * -1.0f;
@@ -117,45 +112,9 @@ static float round_eval(float data)
return data;
}
-static float erf_eval(float x)
-{
- float res = 0;
- float tmp = x;
- float factorial = 1; /*n!*/
- float x_pow = x;
- int32_t one = 1;
- int32_t n = 1;
-
- if (x <= -3)
- {
- return -1;
- }
- else if (x >= 3)
- {
- return 1;
- }
-
- while (vsi_abs(tmp) > 1e-5)
- {
- res += tmp;
-
- factorial *= n;
- one *= -1;
- x_pow *= x * x;
- tmp = one / factorial * x_pow / ( 2 * n + 1);
-
- n ++;
- }
-#define VSI_MUL2_RSQRTPI (1.1283791670955126f)
-
- res *= VSI_MUL2_RSQRTPI;
-
- return res;
-}
-
static float gelu_eval(float data)
{
- data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f))));
+ data = (float)(0.5f * data * (1 + vsi_nn_erf_impl(data / (float)sqrt(2.0f))));
return data;
}
@@ -169,6 +128,23 @@ static float hgelu_eval(float data)
return data * cdf;
}
+static float selu_eval(float data, float alpha, float gamma)
+{
+ float y0 = alpha * gamma * expf(data) - alpha * gamma;
+ float y1 = gamma * data;
+ float y = data <= 0 ? y0 : y1;
+
+ return y;
+}
+
+static float celu_eval(float x, float alpha)
+{
+ float positive = vsi_nn_max(0, x);
+ float negative = vsi_nn_min(alpha * (expf(x / alpha) - 1), 0);
+
+ return positive + negative;
+}
+
DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
(
vsi_nn_kernel_node_t node,
@@ -227,9 +203,6 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_LOG:
data = log_eval(data);
break;
- case UNARY_ELU:
- data = elu_eval(data, alpha);
- break;
case UNARY_NEG:
data = neg_eval(data);
break;
@@ -248,6 +221,12 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_HGELU:
data = hgelu_eval(data);
break;
+ case UNARY_SELU:
+ data = selu_eval(data, alpha, beta);
+ break;
+ case UNARY_CELU:
+ data = celu_eval(data, alpha);
+ break;
default:
break;
}
@@ -287,20 +266,6 @@ static vx_param_description_t kernel_param_def[] =
#define INPUT_SCALAR_ALPHA (3)
#define INPUT_SCALAR_BETA (4)
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _eltwise_unary_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -308,7 +273,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _eltwise_unary_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -384,10 +353,11 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG )
-REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU )
-REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
\ No newline at end of file
+REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
+REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU )
+REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU )
\ No newline at end of file
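
Note: for reference, the two new activations compute selu(x) = gamma * x for x > 0 and gamma * alpha * (expf(x) - 1) otherwise, and celu(x) = max(0, x) + min(0, alpha * (expf(x / alpha) - 1)), matching selu_eval()/celu_eval() above (the dispatch passes the kernel's alpha/beta scalars as alpha/gamma). A standalone sanity check with illustrative constants:

#include <math.h>
#include <stdio.h>

/* Same math as selu_eval()/celu_eval() above, restated for a quick check. */
static float ref_selu(float x, float alpha, float gamma)
{
    return x > 0.0f ? gamma * x : gamma * alpha * (expf(x) - 1.0f);
}

static float ref_celu(float x, float alpha)
{
    return fmaxf(0.0f, x) + fminf(0.0f, alpha * (expf(x / alpha) - 1.0f));
}

int main(void)
{
    /* Constants are the common SELU defaults and alpha = 1 for CELU; illustrative only. */
    printf("selu(-1) = %f\n", ref_selu(-1.0f, 1.67326f, 1.05070f));
    printf("celu(-1) = %f\n", ref_celu(-1.0f, 1.0f));
    return 0;
}
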
diff --git a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c
index 12c789c..cf427f7 100644
--- a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -58,7 +57,6 @@ static vx_param_description_t _erf_kernel_param_def[] =
};
#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def )
-
/*
* Kernel function
*/
@@ -74,8 +72,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -101,34 +99,10 @@ DEF_KERNEL_EXECUTOR(_compute)
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
-#define VSI_ERF_PI 3.141592653589793
for (i = 0; i < out_elements[0]; i ++)
{
- /* 2 / sqrt(pi) * (sum[(-1)^n! * x ^ (2n + 1)] + x) */
- float x = vsi_clamp(f32_in_buffer[0][i], -2, 2);
- float res = 0;
- float tmp = x;
- float factorial = 1; /*n!*/
- float x_pow = x;
- int32_t one = 1;
- int32_t n = 1;
-
- while (vsi_abs(tmp) > 1e-5)
- {
- res += tmp;
-
- factorial *= n;
- one *= -1;
- x_pow *= x * x;
- tmp = one / factorial * x_pow / ( 2 * n + 1);
-
- n ++;
- }
-
-
- res *= 2.0f / (float)sqrt(VSI_ERF_PI);
-
- f32_out_buffer[0][i] = res;
+ float x = vsi_nn_erf_impl(f32_in_buffer[0][i]);
+ f32_out_buffer[0][i] = x;
}
/* save data */
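
Note: both this kernel and the gelu path in eltwise_unary_cpu.c now call the shared vsi_nn_erf_impl helper instead of carrying private Taylor loops. The removed loops implemented the series below; this standalone sketch restates it only to document what the shared helper replaces (the helper's own implementation is not part of this diff):

#include <math.h>
#include <stdio.h>

/* Series form of erf, mirroring the loop removed above:
 * erf(x) = 2/sqrt(pi) * sum_{n>=0} (-1)^n x^(2n+1) / (n! * (2n + 1)).
 * Fine for moderate |x|; the removed kernel clamped its input to [-2, 2] first. */
static float erf_series(float x)
{
    float res = 0.0f, term = x, factorial = 1.0f, x_pow = x;
    float sign = 1.0f;
    int n = 1;

    while (fabsf(term) > 1e-5f)
    {
        res += term;
        factorial *= n;
        sign *= -1.0f;
        x_pow *= x * x;
        term = sign / factorial * x_pow / (2 * n + 1);
        n++;
    }
    return res * 1.1283791670955126f;   /* 2 / sqrt(pi), same constant as the old code */
}

int main(void)
{
    printf("erf(1.0) ~= %f\n", erf_series(1.0f));   /* expect roughly 0.8427 */
    return 0;
}
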
diff --git a/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c b/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c
index 0625cd6..371aead 100644
--- a/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
diff --git a/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c b/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c
index f8c0ed8..99ca050 100644
--- a/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -94,8 +93,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
@@ -110,7 +109,6 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( floordiv, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c
index b91dabd..aa02a41 100644
--- a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -181,20 +180,6 @@ static vx_param_description_t _gather_kernel_param_def[] =
};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _gather_exec,
- _gather_kernel_param_def,
- _cnt_of_array( _gather_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -202,7 +187,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _gather_exec;
+ kernel->info.parameters = _gather_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -260,4 +249,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( gather, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_elements_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_elements_cpu.c
new file mode 100644
index 0000000..65778e5
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cpu/gather_elements_cpu.c
@@ -0,0 +1,228 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+#define _ARG_NUM (1)
+#define _INPUT_NUM (2)
+#define _OUTPUT_NUM (1)
+#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
+#define _CPU_PARAM_NUM (_ARG_NUM + _CPU_IO_NUM)
+#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather_elements")
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _gather_elements_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ // Add kernel parameters here
+};
+#define _GATHER_ELEMENTS_PARAM_NUM _cnt_of_array( _gather_elements_kernel_param_def )
+
+
+/*
+ * Kernel function
+ */
+DEF_KERNEL_EXECUTOR(_compute)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
+ float * buffer[2] = { NULL };
+ int32_t* buffer_idx = NULL;
+ size_t out_elements = 0;
+ vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
+ vsi_size_t a = 0;
+ vsi_size_t o = 0;
+ vsi_size_t i = 0;
+ vsi_size_t outer_size[2] = {1, 1};
+ vsi_size_t inner_size[2] = {1, 1};
+ vsi_size_t axis_size[2] = {1, 1};
+ int32_t axis = 0;
+
+ tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
+ tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
+ tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+ attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+ attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
+ CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
+ out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] );
+
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
+ CHECK_STATUS_FAIL_GOTO(status, final );
+
+ buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
+ CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
+
+ buffer_idx = (int32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE );
+ CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final );
+
+ buffer[1] = (float *)malloc( out_elements * sizeof(float) );
+ CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
+ memset( buffer[1], 0, out_elements * sizeof(float) );
+
+ axis_size[0] = attr[0]->shape->data[axis];
+ axis_size[1] = attr[2]->shape->data[axis];
+ for (i = 0; i < (vsi_size_t)axis; ++i)
+ {
+ inner_size[0] *= attr[0]->shape->data[i];
+ inner_size[1] *= attr[2]->shape->data[i];
+ }
+
+ for (i = axis + 1; i < attr[2]->shape->size; ++i)
+ {
+ outer_size[0] *= attr[0]->shape->data[i];
+ outer_size[1] *= attr[2]->shape->data[i];
+ }
+
+ for (o = 0; o < outer_size[1]; o++)
+ {
+ for (a = 0; a < axis_size[1]; a++)
+ {
+ for (i = 0; i < inner_size[1]; i++)
+ {
+ vsi_ssize_t index = 0;
+ vsi_size_t index0 = (o * axis_size[1] + a) * inner_size[1] + i;
+ vsi_size_t index1 = 1;
+
+ index = (vsi_ssize_t)buffer_idx[index0];
+ index = index < 0 ? index + (vsi_ssize_t)axis_size[0] : index;
+ index1 = (o * axis_size[0] + index) * inner_size[0] + i;
+
+ buffer[1][index0] = buffer[0][index1];
+ }
+ }
+ }
+
+ status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
+ buffer[1], out_elements );
+ CHECK_STATUS_FAIL_GOTO( status, final );
+final:
+ if ( buffer_idx )
+ {
+ free( buffer_idx );
+ }
+ for ( i = 0; i < 2; i ++ )
+ {
+ if ( buffer[i] )
+ {
+ free( buffer[i] );
+ }
+ }
+ for ( i = 0; i < _CPU_IO_NUM; i ++ )
+ {
+ if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
+ }
+
+ return status;
+} /* _compute() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs
+ )
+{
+ vsi_status status = VSI_SUCCESS;
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _compute;
+ kernel->info.parameters = _gather_elements_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def );
+
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_GATHER_ELEMENTS_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
+
+ status = _query_kernel( kernel, inputs, outputs );
+ if ( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _GATHER_ELEMENTS_PARAM_NUM,
+ inputs, input_num, outputs, output_num );
+ /* Pass parameters to node. */
+ node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_ELEMENTS_PARAM_NUM );
+ vsi_nn_kernel_scalar_release( &node_params[3] );
+ }
+ }
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CPU( gather_elements, _setup )
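
Note: the new kernel flattens both tensors into inner (dims below axis), axis, and outer (dims above axis) blocks, then for every output element re-reads the axis coordinate from the index tensor, wrapping negative values, and copies the input element at the same inner/outer coordinates. A self-contained sketch of that addressing with made-up shapes and data:

#include <stdio.h>

int main(void)
{
    /* Illustrative only: input shape {4, 2}, indices/output shape {2, 2}, axis = 0,
     * flat layout with dim 0 fastest, so element (d0, d1) lives at d1*4 + d0. */
    float in[8]  = { 0, 1, 2, 3, 10, 11, 12, 13 };
    int   idx[4] = { 3, -2, 0, 1 };                  /* negative entries wrap, as in the kernel */
    float out[4] = { 0 };
    int   axis_in = 4, axis_out = 2, outer = 2;
    int   o, a;

    for (o = 0; o < outer; o++)
    {
        for (a = 0; a < axis_out; a++)
        {
            int index0 = o * axis_out + a;           /* flat offset into idx/out */
            int index  = idx[index0];
            if (index < 0) { index += axis_in; }     /* wrap negative indices */
            out[index0] = in[o * axis_in + index];   /* same outer coord, gathered axis coord */
        }
    }
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 3 2 10 11 */
    return 0;
}
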
diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c
index e446623..d57cfd4 100644
--- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -166,20 +165,6 @@ static vx_param_description_t _gather_nd_kernel_param_def[] =
};
#define _GATHER_ND_PARAM_NUM _cnt_of_array( _gather_nd_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _gather_nd_exec,
- _gather_nd_kernel_param_def,
- _cnt_of_array( _gather_nd_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -187,7 +172,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _gather_nd_exec;
+ kernel->info.parameters = _gather_nd_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _gather_nd_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -238,4 +227,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( gather_nd, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c b/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c
index f764c18..86e0c7e 100644
--- a/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
@@ -195,8 +194,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -215,7 +214,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -504,4 +502,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( generate_proposals, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c
index 570a1a2..82b2482 100644
--- a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@@ -187,20 +186,6 @@ static vx_param_description_t _group_normalization_kernel_param_def[] =
};
#define _GROUP_NORMALIZATION_PARAM_NUM _cnt_of_array( _group_normalization_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _group_norm_exec,
- _group_normalization_kernel_param_def,
- _cnt_of_array( _group_normalization_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -208,7 +193,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _group_norm_exec;
+ kernel->info.parameters = _group_normalization_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _group_normalization_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -312,4 +301,3 @@ final:
__END_DECLS
REGISTER_BACKEND_CPU( group_norm, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c
index d3dc822..1468b26 100644
--- a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c
@@ -34,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -474,7 +473,9 @@ static vsi_nn_kernel_node_t _setup
if( node )
{
_inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**));
+ CHECK_PTR_FAIL_GOTO( _inputs, "Create buffer fail.", final );
node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
+ CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final );
for(i = 0; i < input_count; i++)
{
_inputs[i] = inputs[i];
@@ -504,6 +505,7 @@ static vsi_nn_kernel_node_t _setup
}
}
+final:
vsi_nn_safe_free(_inputs);
vsi_nn_safe_free(node_params);
return node;
@@ -512,4 +514,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( grucell_activation, _setup )
-
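
Note: the two added CHECK_PTR_FAIL_GOTO checks plus the new final: label close the window where a failed malloc would have been dereferenced in the packing loop; cleanup now runs on both the success and failure paths. A runnable sketch of that shape (counts are illustrative, and vsi_nn_safe_free is assumed to free-and-NULL while tolerating NULL, which is what the mock below does):

#include <stdio.h>
#include <stdlib.h>

#define safe_free(p) do { if (p) { free(p); (p) = NULL; } } while (0)

int main(void)
{
    size_t input_count = 4, param_count = 7;
    void **_inputs = NULL;
    void **node_params = NULL;

    _inputs = malloc(input_count * sizeof(*_inputs));
    if (!_inputs) { goto final; }                 /* CHECK_PTR_FAIL_GOTO(_inputs, ..., final) */
    node_params = malloc(param_count * sizeof(*node_params));
    if (!node_params) { goto final; }             /* second allocation checked the same way */

    /* ... pack inputs/outputs and pass parameters here ... */

final:
    safe_free(_inputs);                           /* cleanup runs on success and failure alike */
    safe_free(node_params);
    return 0;
}
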
diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c
index 0d9c46c..783f779 100644
--- a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c
@@ -33,7 +33,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -179,4 +178,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( grucell_activation_sma, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c b/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c
index 9af5fa4..61f6cd2 100644
--- a/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c
@@ -36,7 +36,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -124,8 +123,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c
index cf9bb0e..24a1db4 100644
--- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -187,20 +186,6 @@ static vx_param_description_t _instance_normalization_kernel_param_def[] =
};
#define _INSTANCE_NORMALIZATION_PARAM_NUM _cnt_of_array( _instance_normalization_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _instance_norm_exec,
- _instance_normalization_kernel_param_def,
- _cnt_of_array( _instance_normalization_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -208,7 +193,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _instance_norm_exec;
+ kernel->info.parameters = _instance_normalization_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _instance_normalization_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
diff --git a/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c
index 79a0e30..c220601 100644
--- a/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -99,7 +98,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -246,4 +244,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( l2normalizescale, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c
index dffd119..1329ce3 100644
--- a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -184,20 +183,6 @@ static vx_param_description_t _layer_normalization_kernel_param_def[] =
};
#define _LAYER_NORMALIZATION_PARAM_NUM _cnt_of_array( _layer_normalization_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _layer_norm_exec,
- _layer_normalization_kernel_param_def,
- _LAYER_NORMALIZATION_PARAM_NUM,
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -205,7 +190,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _layer_norm_exec;
+ kernel->info.parameters = _layer_normalization_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _layer_normalization_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -252,4 +241,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( layer_norm, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c
index 1733ac9..67e0d84 100644
--- a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c
@@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (2)
@@ -153,20 +152,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _log_softmax_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -174,7 +159,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _log_softmax_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -232,4 +221,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( log_softmax, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c
index b03a413..9bcdcab 100644
--- a/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -72,8 +71,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -85,7 +84,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -197,4 +195,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( logical_not, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c
index d90d1af..07deb44 100644
--- a/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -96,8 +95,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
@@ -113,7 +112,6 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -264,4 +262,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( logical_ops, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c
index 2cc2209..ade68ef 100644
--- a/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -156,7 +155,6 @@ DEF_KERNEL_EXECUTOR(_compute)
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
-
}
for( i = 0; i < _OUTPUT_NUM; i++ )
@@ -308,7 +306,6 @@ final:
}
return status;
-
} /* _compute() */
@@ -331,7 +328,6 @@ static vsi_status _query_kernel
status = VSI_SUCCESS;
return status;
-
} /* _query_kernel() */
@@ -397,10 +393,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
-
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( lstmunit_activation, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c
index 20130bb..846df68 100644
--- a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -190,20 +189,6 @@ static vx_param_description_t _matrixmul_kernel_param_def[] =
};
#define _MATIRXMUL_PARAM_NUM _cnt_of_array( _matrixmul_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _matrixmul_exec,
- _matrixmul_kernel_param_def,
- _cnt_of_array( _matrixmul_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -211,7 +196,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _matrixmul_exec;
+ kernel->info.parameters = _matrixmul_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -261,4 +250,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( matrixmul, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c
index 74a06ae..e109349 100644
--- a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c
@@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -147,21 +146,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _maximum_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -169,7 +153,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _maximum_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -211,4 +199,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( maximum, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c
index cfac7cd..61d94c6 100644
--- a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -143,21 +142,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _minimum_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -165,7 +149,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _minimum_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -207,4 +195,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( minimum, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c
index 981342c..431eee7 100644
--- a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -218,20 +217,6 @@ static vx_param_description_t _moments_kernel_param_def[] =
};
#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _moments_exec,
- _moments_kernel_param_def,
- _cnt_of_array( _moments_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -239,7 +224,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _moments_exec;
+ kernel->info.parameters = _moments_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _moments_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -315,4 +304,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( moments, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c
index 62e695f..f387d81 100644
--- a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
diff --git a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c
index 146b093..5508499 100644
--- a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
diff --git a/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c
index 3a8feca..19a6e85 100644
--- a/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -86,8 +85,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -112,7 +111,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -335,4 +333,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( poolwithargmax, _setup )
-
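A second change repeated through these executors is the explicit "= {NULL}" initializer on the in_attr/out_attr pointer arrays. Every _compute() bails out to a shared final: label on the first failure and releases whatever it has created so far, so array slots that were never reached must already read as NULL instead of holding indeterminate stack values. A standalone sketch of that cleanup pattern, with stand-in helpers in place of the framework's attribute create/release calls:

#include <stdio.h>
#include <stdlib.h>

#define INPUT_NUM 2

/* Stand-in for vsi_nn_kernel_tensor_attr_create(); fails on the second input. */
static int make_attr(int i, int **out)
{
    if (i == 1) return -1;
    *out = malloc(sizeof(int));
    return 0;
}

int main(void)
{
    int *attr[INPUT_NUM] = { NULL };     /* without this, attr[1] would be indeterminate below */
    int status = -1;
    int i;

    for (i = 0; i < INPUT_NUM; i++)
    {
        if (make_attr(i, &attr[i]) != 0)
            goto final;                  /* mirrors CHECK_PTR_FAIL_GOTO(..., final) */
    }
    status = 0;

final:
    for (i = 0; i < INPUT_NUM; i++)
    {
        free(attr[i]);                   /* free(NULL) is safe; freeing garbage is not */
        attr[i] = NULL;
    }
    printf("status = %d\n", status);
    return status;
}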
diff --git a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c
index 77d1036..39d53dd 100644
--- a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -146,21 +145,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _pow_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -168,7 +152,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _pow_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -210,4 +198,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pow, _setup )
-
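Note also that numParams is always derived from the parameter table itself through _cnt_of_array, so the count cannot drift from the table when entries are added or removed. The macro is the usual sizeof idiom; a tiny standalone illustration (the param_desc_t stand-in is illustrative):

#include <stdio.h>

#define _cnt_of_array(arr) (sizeof(arr) / sizeof((arr)[0]))   /* element count of a true array */

typedef struct { int direction; } param_desc_t;

static const param_desc_t kernel_param_def[] = {
    { 0 }, { 0 }, { 1 }                  /* e.g. two inputs, one output */
};

int main(void)
{
    /* Adding or removing a row above changes this count automatically. */
    printf("numParams = %zu\n", _cnt_of_array(kernel_param_def));
    return 0;
}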
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c
index 6a78ee9..bca6300 100644
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -283,21 +282,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _pre_process_bgra_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -305,7 +289,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _pre_process_bgra_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -381,4 +369,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_bgra, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c
index 088b6a5..f7d4248 100644
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -194,21 +193,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _pre_process_gray_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -216,7 +200,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _pre_process_gray_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -280,4 +268,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_gray, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c
index 78c7cf5..f9c47f9 100644
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -256,21 +255,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _pre_process_nv12_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -278,7 +262,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _pre_process_nv12_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -354,4 +342,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_nv12, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c
new file mode 100644
index 0000000..1af66f0
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c
@@ -0,0 +1,297 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+#define _CPU_ARG_NUM (8)
+#define _CPU_INPUT_NUM (3)
+#define _CPU_OUTPUT_NUM (3)
+#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
+#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
+#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_rgb888_planar")
+
+#define DESCALE(x) (((x) + (1<<19)) >> 20)
+/*
+ * Kernel params
+ */
+static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
+
+
+/*
+ * Kernel function
+ */
+DEF_KERNEL_EXECUTOR(_compute)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VX_FAILURE;
+ vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
+ float * buffer[_CPU_IO_NUM] = { NULL };
+ size_t out_elements = 0;
+ vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
+ uint32_t i = 0;
+ int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
+ float mean[3] = {0}, scale = 1;
+
+ for (i = 0; i < _CPU_IO_NUM; i++)
+ {
+ tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
+ attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
+ CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
+ }
+
+ out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
+
+ i = 6;
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio);
+ status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio);
+ status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset);
+ status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset);
+ status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[0]);
+ status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[1]);
+ status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[2]);
+ status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &scale);
+ CHECK_STATUS_FAIL_GOTO(status, final );
+
+ for (i = 0; i < 3; i++)
+ {
+ buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
+ CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
+
+ buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) );
+ CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final );
+ memset( buffer[i + 3], 0, out_elements * sizeof(float) );
+ }
+
+ {
+ int32_t line1[2], line2[2];
+ int32_t dx = 0, dy = 0, idx = 0;
+ int32_t src_width = (int32_t)attr[0]->shape->data[0];
+ int32_t dst_width = (int32_t)attr[3]->shape->data[0];
+ int32_t dst_height = (int32_t)attr[3]->shape->data[1];
+ uint8_t result = 0;
+
+ for ( idx = 0; idx < 3; idx ++)
+ {
+ for ( dy = 0; dy < (int32_t)dst_height; dy ++)
+ {
+ for ( dx = 0; dx < (int32_t)dst_width; dx ++)
+ {
+ int32_t source_index = 0;
+ int32_t output_index = dx + dy * dst_width;
+ float finalVal = 0.0f;
+
+ if(xRatio != (1 << 15) || yRatio != (1 << 15))
+ {
+ int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14);
+ int32_t sx = fx & 0xffff8000; // Floor
+ int32_t fy = 0, sy = 0;
+ int32_t temp1 = 0;
+ int32_t temp2 = 0;
+
+ fx -= sx;
+ sx = sx >> 15;
+
+ sx = sx < 0 ? 0 : sx;
+ sx = sx > src_width ? src_width - 1: sx;
+
+ fx = (fx +(1 << 4)) >> 5;
+
+ // for y
+ fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14);
+ sy = fy & 0xffff8000; // Floor
+ fy -= sy;
+ sy = sy >> 15;
+
+ sy = sy < 0 ? 0 : sy;
+ fy = fy < 0 ? 0 : fy;
+
+ fy = (fy + (1<< 4)) >> 5;
+
+ sx += xOffset;
+ sy += yOffset;
+ source_index = (sx + sy * src_width);
+
+ line1[0] = (int32_t)buffer[idx][source_index];
+ line1[1] = (int32_t)buffer[idx][source_index + 1];
+ line2[0] = (int32_t)buffer[idx][source_index + src_width];
+ line2[1] = (int32_t)buffer[idx][source_index + src_width + 1];
+
+ temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10);
+ temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10);
+ temp1 = fy * (temp2 - temp1) + (temp1 << 10);
+ result = (uint8_t)(DESCALE(temp1));
+ finalVal = (result - mean[idx]) * scale;
+ buffer[idx + 3][output_index] = finalVal;
+ }
+ else
+ {
+ int32_t offset = xOffset + yOffset * src_width;
+ source_index = dx + dy * src_width + offset;
+                        finalVal = (buffer[idx][source_index] - mean[idx]) * scale;
+                        buffer[idx + 3][output_index] = finalVal;
+ }
+ }
+ }
+ }
+ }
+ for (i = 3; i < _CPU_IO_NUM; i++)
+ {
+ status = vsi_nn_kernel_tensor_write_from_float( tensors[i], attr[i],
+ buffer[i], out_elements );
+ CHECK_STATUS_FAIL_GOTO( status, final );
+ }
+
+final:
+ for ( i = 0; i < _CPU_IO_NUM; i ++ )
+ {
+ if ( buffer[i] )
+ {
+ free( buffer[i] );
+ }
+ if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
+ }
+
+ return status;
+} /* _compute() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs
+ /* Add extra params */
+ )
+{
+ vsi_status status = VSI_SUCCESS;
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _compute;
+ kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
+
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+
+ status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
+ if ( VSI_SUCCESS == status)
+ {
+ uint32_t index = 6;
+ int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
+ int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
+ int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
+ int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
+ float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
+ float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
+ float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
+ float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
+
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM,
+ inputs, input_num, outputs, output_num );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM );
+ vsi_nn_kernel_scalar_release( &node_params[6] );
+ vsi_nn_kernel_scalar_release( &node_params[7] );
+ vsi_nn_kernel_scalar_release( &node_params[8] );
+ vsi_nn_kernel_scalar_release( &node_params[9] );
+ vsi_nn_kernel_scalar_release( &node_params[10] );
+ vsi_nn_kernel_scalar_release( &node_params[11] );
+ vsi_nn_kernel_scalar_release( &node_params[12] );
+ vsi_nn_kernel_scalar_release( &node_params[13] );
+ }
+ }
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CPU( pre_process_rgb888_planar, _setup )
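For reference, the scaling branch of _compute() in the new file works entirely in fixed point: xRatio and yRatio carry the source/destination ratio in Q15, the per-pixel fractions are reduced to Q10, and DESCALE rounds the final Q20 product back to an 8-bit sample. A standalone sketch of that arithmetic follows; the helper name and the clamp that keeps sx + 1 in range are illustrative, and only the shifts and rounding mirror the code above.

#include <stdint.h>
#include <stdio.h>

#define DESCALE(x) (((x) + (1 << 19)) >> 20)   /* round a Q20 value to the nearest integer */

/* Bilinear sample at destination pixel (dx, dy) given two adjacent source rows. */
static uint8_t sample_q15(const uint8_t *row0, const uint8_t *row1,
                          int32_t src_w, int32_t dx, int32_t dy,
                          int32_t xRatio, int32_t yRatio)
{
    int32_t fx, sx, fy, sy, t1, t2;

    /* Source x position in Q15, centred on the destination pixel footprint. */
    fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14);
    sx = fx & 0xffff8000;                      /* floor to an integer source column */
    fx -= sx;
    sx = sx >> 15;
    if (sx < 0) sx = 0;
    if (sx > src_w - 2) sx = src_w - 2;        /* keep sx + 1 readable in this sketch */
    fx = (fx + (1 << 4)) >> 5;                 /* Q15 fraction -> Q10 */

    /* Same reduction vertically; in the patch sy + yOffset selects the source rows. */
    fy = (dy * yRatio + (yRatio >> 1)) - (1 << 14);
    sy = fy & 0xffff8000;
    fy -= sy;
    if (fy < 0) fy = 0;
    fy = (fy + (1 << 4)) >> 5;
    (void)sy;                                  /* rows are passed in directly here */

    /* Horizontal lerp on both rows (Q10 each), then vertical lerp -> Q20. */
    t1 = fx * (row0[sx + 1] - row0[sx]) + (row0[sx] << 10);
    t2 = fx * (row1[sx + 1] - row1[sx]) + (row1[sx] << 10);
    t1 = fy * (t2 - t1) + (t1 << 10);
    return (uint8_t)DESCALE(t1);               /* round Q20 back to an 8-bit sample */
}

int main(void)
{
    /* 4-wide source rows, x downscaled by 2: ratio = 2.0 in Q15. */
    const uint8_t row0[4] = { 10, 20, 30, 40 };
    const uint8_t row1[4] = { 50, 60, 70, 80 };
    int32_t xRatio = 2 << 15;
    int32_t yRatio = 1 << 15;
    int dx;

    for (dx = 0; dx < 2; dx++)
    {
        printf("dst[%d] = %d\n", dx,
               sample_q15(row0, row1, 4, dx, 0, xRatio, yRatio));
    }
    return 0;                                  /* expected: 15 and 35 */
}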
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c
index b505d0c..16068b6 100644
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -282,21 +281,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _pre_process_rgb_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -304,7 +288,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _pre_process_rgb_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -380,4 +368,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_rgb, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c
index efd2b60..aa814f2 100644
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -331,21 +330,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _pre_process_yuv420_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -353,7 +337,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _pre_process_yuv420_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -429,4 +417,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_yuv420, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c
index c5e8d6e..007d9c8 100644
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -325,21 +324,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _pre_process_yuv444_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -347,7 +331,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _pre_process_yuv444_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -423,4 +411,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_yuv444, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c
index eff26ed..7209c9a 100644
--- a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -144,21 +143,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _prelu_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -166,7 +150,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _prelu_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -216,4 +204,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( prelu, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c
index 8d3e8b5..3bd40d6 100644
--- a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c
@@ -38,7 +38,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -259,4 +258,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( random_multinomial, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c
index a994f5b..5999b8c 100644
--- a/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reduceall_internal, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c
index b15437d..39a2ff4 100644
--- a/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reduceany_internal, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c
index 06479eb..c1f688c 100644
--- a/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reducemax_internal, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c
index 22f9b68..3151853 100644
--- a/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -237,4 +235,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reducemin_internal, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c
index d5cd781..64b87c8 100644
--- a/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -74,8 +73,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -94,7 +93,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -235,4 +233,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reduceprod_internal, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c b/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c
index ffee85d..3c4630d 100644
--- a/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -79,8 +78,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -96,7 +95,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -226,4 +224,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( relu_keras, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c
index f434ad1..3021604 100644
--- a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -215,20 +214,6 @@ static vx_param_description_t _repeat_kernel_param_def[] =
};
#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _repeat_exec,
- _repeat_kernel_param_def,
- _cnt_of_array( _repeat_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -236,7 +221,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _repeat_exec;
+ kernel->info.parameters = _repeat_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _repeat_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -283,4 +272,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( repeat, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c
index 161d2c2..ed1eff5 100644
--- a/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -100,7 +99,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -268,4 +266,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( resize_1d_bilinear, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c
index 3f500bd..195353d 100644
--- a/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -97,7 +96,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -268,4 +266,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( resize_1d_nearest, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c
index 611bbfa..6b7a3d9 100644
--- a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c
index adb0620..61690c3 100644
--- a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c
@@ -76,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c
index ba0039c..d74f6cb 100644
--- a/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -102,7 +101,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -246,7 +244,6 @@ final:
}
return status;
-
} /* _compute() */
@@ -269,7 +266,6 @@ static vsi_status _query_kernel
status = VSI_SUCCESS;
return status;
-
} /* _query_kernel() */
@@ -310,10 +306,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
-
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( resize_nearest, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c
index 3ec3b56..a5f0467 100644
--- a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -150,8 +149,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -179,7 +178,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -369,10 +367,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
-
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( roi_align, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c
index b610ac2..030487a 100644
--- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -182,20 +181,6 @@ static vx_param_description_t _scatter_nd_kernel_param_def[] =
// Add kernel parameters here
};
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _scatter_nd_exec,
- _scatter_nd_kernel_param_def,
- _cnt_of_array( _scatter_nd_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -203,7 +188,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _scatter_nd_exec;
+ kernel->info.parameters = _scatter_nd_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _scatter_nd_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -256,4 +245,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( scatter_nd, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c
index 3156df2..564e861 100644
--- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -208,20 +207,6 @@ static vx_param_description_t _scatter_nd_update_kernel_param_def[] =
// Add kernel parameters here
};
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _scatter_nd_update_exec,
- _scatter_nd_update_kernel_param_def,
- _cnt_of_array( _scatter_nd_update_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -229,7 +214,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _scatter_nd_update_exec;
+ kernel->info.parameters = _scatter_nd_update_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -282,4 +271,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( scatter_nd_update, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/select_cpu.c b/src/tim/vx/internal/src/kernel/cpu/select_cpu.c
index ca4ff58..b172325 100644
--- a/src/tim/vx/internal/src/kernel/cpu/select_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/select_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -94,8 +93,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
@@ -110,7 +109,6 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( select, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c
index b4b0e66..6844f4a 100644
--- a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c
@@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -121,21 +120,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _sequence_mask_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -143,7 +127,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _sequence_mask_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -245,4 +233,3 @@ final:
__END_DECLS
REGISTER_BACKEND_CPU( sequence_mask, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c
index 9b326dd..11d475c 100644
--- a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
diff --git a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c
index 1d2770d..0a518b0 100644
--- a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -154,20 +153,6 @@ static vx_param_description_t _space2depth_internal_kernel_param_def[] =
};
#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def )
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _space2depth_internal_exec,
- _space2depth_internal_kernel_param_def,
- _cnt_of_array( _space2depth_internal_kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -175,7 +160,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _space2depth_internal_exec;
+ kernel->info.parameters = _space2depth_internal_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _space2depth_internal_kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -227,4 +216,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( space2depth_internal, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c b/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c
index 5d8ed88..243294c 100644
--- a/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
diff --git a/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c b/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c
index f17e876..65d4a4c 100644
--- a/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c
@@ -38,7 +38,6 @@
#include "utils/vsi_nn_math.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -78,8 +77,8 @@ DEF_KERNEL_EXECUTOR(_swish_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -93,7 +92,6 @@ DEF_KERNEL_EXECUTOR(_swish_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -173,7 +171,6 @@ DEF_KERNEL_EXECUTOR(_hswish_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -295,4 +292,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( swish, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c
index 7c71221..ee6c564 100644
--- a/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -72,8 +71,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
void *in_buffer[_INPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0;
diff --git a/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c
index ae98921..60b33ac 100644
--- a/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -74,8 +73,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
size_t out_elements[_OUTPUT_NUM] = {0};
uint32_t i = 0;
vsi_size_t depth = 0;
diff --git a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c
index 343dbe4..3126c31 100644
--- a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c
@@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -154,21 +153,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
-
-static const vx_kernel_description_t _kernel_info =
-{
- KERNEL_ID_PLACEHOLDER,
- _KERNEL_NAME,
- _tile_exec,
- kernel_param_def,
- _cnt_of_array( kernel_param_def ),
- vsi_nn_KernelValidator,
- NULL,
- NULL,
- vsi_nn_KernelInitializer,
- vsi_nn_KernelDeinitializer
-};
-
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@@ -176,7 +160,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
- memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
+ kernel->info.function = _tile_exec;
+ kernel->info.parameters = kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( kernel_param_def );
+
return VSI_SUCCESS;
} /* _query_kernel() */
@@ -218,6 +206,3 @@ static vsi_nn_kernel_node_t _setup
REGISTER_BACKEND_CPU( tile, _setup )
__END_DECLS
-
-
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c
index 303b3fb..8c04a2a 100644
--- a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -115,8 +114,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -283,4 +282,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( topk, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c
index 2b745ec..77b16a2 100644
--- a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c
@@ -36,7 +36,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -103,7 +102,6 @@ DEF_KERNEL_EXECUTOR(_compute)
}
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -261,10 +259,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
-
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( upsample, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c
index 980b7c4..b3b4bb4 100644
--- a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c
+++ b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -78,8 +77,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
- vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
- vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
+ vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
+ vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@@ -101,7 +100,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
-
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@@ -150,7 +148,6 @@ DEF_KERNEL_EXECUTOR(_compute)
f32_out_buffer[0][idx] = data;
}
}
-
}
}
}
@@ -261,4 +258,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( upsamplescale, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c
index 219190a..1140cd5 100644
--- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c
@@ -22,7 +22,6 @@
*
*****************************************************************************/
-
#include
#include
#include
@@ -35,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
#include "utils/vsi_nn_dtype_util.h"
__BEGIN_DECLS
@@ -287,7 +285,6 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
uniDataMulAndPostShift_2x8.data[7] |= (postShift & 0x1F);
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDataMulAndPostShift_2x8", &uniDataMulAndPostShift_2x8);
-
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
@@ -502,11 +499,8 @@ final:
SAFE_FREE_TENSOR_ATTR(output_attr);
SAFE_FREE_TENSOR_ATTR(input_attr);
return status;
-
} /* _clip_initializer() */
-
-
/*
* Query kernel
*/
@@ -535,14 +529,21 @@ static vsi_status _query_kernel
key = CLIP_HASH_KEY( in_dtype, out_dtype, image_2d );
- for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ if ( ( in_dtype == I8 || in_dtype == I16 ) &&
+ ( inputs[0]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_DFP &&
+ inputs[0]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_NONE ) )
{
- if( kernel_map[i].key == key )
+ return VSI_FAILURE;
+ }
+
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if ( kernel_map[i].key == key )
{
break;
}
}
- if( i < (uint32_t)kernel_map_size )
+ if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@@ -579,7 +580,7 @@ static vsi_nn_kernel_node_t _setup
float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" );
float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" );
- if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
+ if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num ) )
{
return NULL;
@@ -587,10 +588,10 @@ static vsi_nn_kernel_node_t _setup
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, image_2d );
- if( VSI_SUCCESS == status)
+ if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
- if( node )
+ if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM,
@@ -610,4 +611,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( clip, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c
index d74b7be..8e5d05e 100644
--- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c
@@ -35,7 +35,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
+#include "vsi_nn_error.h"
__BEGIN_DECLS
@@ -435,6 +435,8 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer)
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
+ SAFE_FREE_TENSOR_ATTR(weights_attr);
+ SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _conv1d_ovxlib_initializer() */
@@ -513,6 +515,7 @@ static vsi_nn_tensor_t* _create_new_bias_tensor
uint32_t i, j;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
weight_data = vsi_nn_ConvertTensorToData(graph, weight);
+ CHECK_PTR_FAIL_GOTO( weight_data, "Create buffer fail.", final );
if (bias == NULL)
{
@@ -539,6 +542,7 @@ static vsi_nn_tensor_t* _create_new_bias_tensor
}
new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t));
+ CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final );
memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]);
if (input->attr.dtype.zero_point != 0)
@@ -564,6 +568,7 @@ static vsi_nn_tensor_t* _create_new_bias_tensor
new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr);
+final:
vsi_nn_safe_free( new_bias_data_ptr );
vsi_nn_safe_free( bias_data );
vsi_nn_safe_free( weight_data );
diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c
index feab3a0..45c4073 100644
--- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -759,8 +758,10 @@ static vsi_nn_kernel_node_t _setup
weights = vsi_nn_pad_tensor(graph, reshape_tensors[1], weight_pad_front, weight_pad_end,
reshape_tensors[1]->attr.dim_num, VSI_NN_PAD_MODE_CONSTANT, 0);
+ CHECK_PTR_FAIL_GOTO( weights, "Create tensor fail.", final );
biases = vsi_nn_merge_input_zeropoint_to_bias(graph, reshape_tensors[0], reshape_tensors[1], reshape_tensors[2]);
+ CHECK_PTR_FAIL_GOTO( biases, "Create tensor fail.", final );
temp_tensor[0] = reshape_tensors[0];
temp_tensor[1] = weights;
@@ -809,6 +810,7 @@ static vsi_nn_kernel_node_t _setup
}
}
+final:
if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
{
vsi_nn_ReleaseTensor( &reshape_tensors[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c
index 1e15e71..6e85e40 100644
--- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c
@@ -45,13 +45,14 @@ typedef enum
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
- UNARY_ELU,
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
UNARY_GELU,
UNARY_HGELU,
+ UNARY_SELU,
+ UNARY_CELU,
} unary_type_e;
/*
@@ -60,13 +61,15 @@ typedef enum
#define HASH_UNARY_KEY(_type, _input_type, _output_type, _image_2d) \
((_type << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d))
-#define KERNEL_SOURCE_2D "eltwise_unary_2d",
-#define KERNEL_SOURCE_3D "eltwise_unary_3d",
+#define KERNEL_SOURCE0_2D "eltwise_unary_2d_0",
+#define KERNEL_SOURCE1_2D "eltwise_unary_2d_1",
+#define KERNEL_SOURCE0_3D "eltwise_unary_3d_0",
+#define KERNEL_SOURCE1_3D "eltwise_unary_3d_1",
#define HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE)
-#define TENSOR_UNARY_KERNELS(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE, SOURCE) \
+#define TENSOR_UNARY_KERNELS_3D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE, SOURCE) \
{ HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \
HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \
SOURCE },
@@ -83,13 +86,14 @@ typedef enum
#define COS_OPERATION cos
#define EXP_OPERATION exp
#define LOG_OPERATION log
-#define ELU_OPERATION elu
#define NEG_OPERATION neg
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define ROUND_OPERATION round
#define GELU_OPERATION gelu
#define HGELU_OPERATION hard_gelu
+#define SELU_OPERATION selu
+#define CELU_OPERATION celu
static const struct {
uint32_t key;
@@ -97,261 +101,283 @@ static const struct {
const char* source_name;
} _eltwise_unary_evis_kernel_map[] =
{
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, BF16, BF16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, BF16, BF16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, BF16, BF16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, BF16, BF16 , KERNEL_SOURCE_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_3D)
+ TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_3D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, BF16, BF16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16 , KERNEL_SOURCE_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_2D)
+ TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_2D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16 , KERNEL_SOURCE_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_3D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16 , KERNEL_SOURCE_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_2D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_3D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_2D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_3D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_2D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_3D)
- TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_3D)
+ TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_3D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16 , KERNEL_SOURCE_2D)
- TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16 , KERNEL_SOURCE_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16, KERNEL_SOURCE0_2D)
+ TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16, KERNEL_SOURCE0_2D)
};
#undef SIN_OPERATION
#undef COS_OPERATION
#undef EXP_OPERATION
#undef LOG_OPERATION
-#undef ELU_OPERATION
+#undef SELU_OPERATION
#undef NEG_OPERATION
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
#undef GELU_OPERATION
#undef HGELU_OPERATION
-
+#undef CELU_OPERATION
/*
* Kernel params
*/
@@ -410,6 +436,10 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[INPUT_SCALAR_BETA], &beta);
CHECK_STATUS_FAIL_GOTO(status, final );
+ if (UNARY_SELU == type)
+ {
+ alpha = alpha * beta;
+ }
out_shape = attr[1]->shape;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
@@ -471,13 +501,14 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ):
- case _PACK_SELECT_KEY( UNARY_ELU, BF16, BF16 ):
+ case _PACK_SELECT_KEY( UNARY_SELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
+ case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@@ -518,8 +549,11 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"alpha", &alpha );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "beta", &beta );
+ if ( type == UNARY_HSIGMOID || type == UNARY_SELU || type == UNARY_CELU )
+ {
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "beta", &beta );
+ }
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
@@ -580,8 +614,11 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
"outputZP", &outputZP );
status |= vsi_nn_kernel_gpu_add_param( node,
"alpha", &alpha );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "beta", &beta );
+ if ( type == UNARY_HSIGMOID || type == UNARY_SELU || type == UNARY_CELU )
+ {
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "beta", &beta );
+ }
if (attr[1]->dtype == F16)
{
@@ -678,7 +715,7 @@ static vsi_nn_kernel_node_t _setup
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
- if( ret )
+ if ( ret )
{
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape, new_rank );
@@ -692,13 +729,18 @@ static vsi_nn_kernel_node_t _setup
goto OnError;
}
+ if ( unary_type == UNARY_CELU )
+ {
+ beta = 1.0f / alpha;
+ }
+
image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1);
status = _query_kernel( rs_tensors, &rs_tensors[1], unary_type, image_2d, kernel );
- if( VSI_SUCCESS == status)
+ if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
- if( node )
+ if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
rs_tensors, 1, &rs_tensors[1], 1 );
@@ -765,12 +807,13 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG )
-REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
+REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU )
+REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU )
__END_DECLS
diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c
index 0c7f277..be1bd17 100644
--- a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c
@@ -132,9 +132,9 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
- float inScale0 = 0;
- float inScale1 = 0;
- float outScale = 0;
+ float inScale0 = 1.0f;
+ float inScale1 = 1.0f;
+ float outScale = 1.0f;
float in0Tail = 0;
float in1Tail = 0;
float outZp = 0;
@@ -178,16 +178,11 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
- status = vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 );
- CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
- status = vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 );
- status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail );
- CHECK_STATUS_FAIL_GOTO(status, final );
}
if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
@@ -201,16 +196,11 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
- status = vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 );
- CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
- status = vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 );
- status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail );
- CHECK_STATUS_FAIL_GOTO(status, final );
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
@@ -224,16 +214,11 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
- status = vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale );
- CHECK_STATUS_FAIL_GOTO(status, final );
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
- status = vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale );
- status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp );
- CHECK_STATUS_FAIL_GOTO(status, final );
}
if (BF16 == input0_dtype)
@@ -317,6 +302,12 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
"uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail );
+ status |= vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail );
+ status |= vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale );
+ status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp );
CHECK_STATUS_FAIL_GOTO(status, final );
}
@@ -439,4 +430,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( floordiv, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c
new file mode 100644
index 0000000..0554d11
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c
@@ -0,0 +1,279 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "libnnext/vx_lib_nnext.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+ INTERNAL_KERNEL_GATHER_ELEMENTS,
+} _internal_kernel_e;
+
+#define _GATHER_ELEMENTS_KERNEL_SOURCE "gather_elements"
+
+#define STR(a) #a
+// Add kernel hashtable here
+#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \
+ (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ))
+#define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
+ CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
+ _GATHER_ELEMENTS_KERNEL_SOURCE}
+
+#define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
+ CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
+ _GATHER_ELEMENTS_KERNEL_SOURCE}
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _gather_elements_kernel_map[] =
+{
+ // Register kernel here
+ PACK_KERNEL_3D_MAP( 0, F16, I32, F16 ),
+ PACK_KERNEL_3D_MAP( 0, I16, I32, I16 ),
+ PACK_KERNEL_3D_MAP( 0, I8, I32, I8 ),
+ PACK_KERNEL_3D_MAP( 0, U8, I32, U8 ),
+ PACK_KERNEL_3D_MAP( 1, F16, I32, F16 ),
+ PACK_KERNEL_3D_MAP( 1, I16, I32, I16 ),
+ PACK_KERNEL_3D_MAP( 1, I8, I32, I8 ),
+ PACK_KERNEL_3D_MAP( 1, U8, I32, U8 ),
+ PACK_KERNEL_3D_MAP( 2, F16, I32, F16 ),
+ PACK_KERNEL_3D_MAP( 2, I16, I32, I16 ),
+ PACK_KERNEL_3D_MAP( 2, I8, I32, I8 ),
+ PACK_KERNEL_3D_MAP( 2, U8, I32, U8 ),
+
+ PACK_KERNEL_2D_MAP( 0, F16, I32, F16 ),
+ PACK_KERNEL_2D_MAP( 0, I16, I32, I16 ),
+ PACK_KERNEL_2D_MAP( 0, I8, I32, I8 ),
+ PACK_KERNEL_2D_MAP( 0, U8, I32, U8 ),
+ PACK_KERNEL_2D_MAP( 1, F16, I32, F16 ),
+ PACK_KERNEL_2D_MAP( 1, I16, I32, I16 ),
+ PACK_KERNEL_2D_MAP( 1, I8, I32, I8 ),
+ PACK_KERNEL_2D_MAP( 1, U8, I32, U8 ),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _gather_elements_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _GATHER_ELEMENTS_PARAM_NUM _cnt_of_array( _gather_elements_kernel_param_def )
+#define SCALAR_INPUT_AXIS (3)
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {1, 1, 1},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+ vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
+ vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
+ vsi_size_array_t * out_shape = NULL;
+ int32_t axis = 0;
+ int32_t axis_size = 0;
+
+ input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
+ output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
+
+ vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis);
+
+ out_shape = output_attr->shape;
+ axis_size = (int32_t)input_attr->shape->data[axis];
+ if (axis == 0)
+ {
+ gpu_param.global_scale[0] = 4;
+ }
+
+ gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
+ gpu_param.global_size[0] = gpu_align_p2(
+ (out_shape->data[0] + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 4);
+ gpu_param.global_size[1] = (
+ (out_shape->data[1] + gpu_param.global_scale[1] - 1)
+ / gpu_param.global_scale[1]);
+ gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
+
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+ status |= vsi_nn_kernel_gpu_add_param( node, "axis_size", &axis_size );
+ CHECK_STATUS_FAIL_GOTO(status, final );
+
+final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
+ SAFE_FREE_TENSOR_ATTR(input_attr);
+ SAFE_FREE_TENSOR_ATTR(output_attr);
+ return status;
+} /* _gather_elements_initializer() */
+
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs,
+ int32_t axis
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in0_dtype;
+ vsi_nn_kernel_dtype_e in1_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _gather_elements_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _gather_elements_kernel_map );
+ vx_param_description_t * param_def = _gather_elements_kernel_param_def;
+ vx_kernel_initialize_f initializer = _gather_elements_initializer;
+ int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0;
+
+ uint32_t key;
+ uint32_t i;
+
+ in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
+ out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ if (in0_dtype == BF16)
+ {
+ in0_dtype = F16;
+ }
+ if (out_dtype == BF16)
+ {
+ out_dtype = F16;
+ }
+
+ key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d );
+
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if ( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def );
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ "vsi_nn_kernel_header",
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_GATHER_ELEMENTS_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
+
+ if ( vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE )
+ {
+ return NULL;
+ }
+
+ status = _query_kernel( kernel, inputs, outputs, axis );
+ if ( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _GATHER_ELEMENTS_PARAM_NUM,
+ inputs, input_num, outputs, output_num );
+ /* Pass parameters to node. */
+ node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_ELEMENTS_PARAM_NUM );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] );
+ }
+ }
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( gather_elements, _setup )
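
For reference, GATHER_ELEMENTS_HASH_KEY packs the axis into bits 0-1, each dtype enum into its own 8-bit field, and the 2D flag into bit 26, so one equality test selects the kernel. The sketch below only mirrors that bit layout; the dtype codes in it are illustrative placeholders, not the real vsi_nn_kernel_dtype_e values.

/* Sketch of the gather_elements kernel-table key: same bit layout as
 * GATHER_ELEMENTS_HASH_KEY, with made-up dtype codes for illustration. */
#include <stdint.h>
#include <stdio.h>

#define KEY(AXIS, IN0, IN1, OUT, IMG_2D) \
    (((uint32_t)(AXIS)) | ((uint32_t)(IN0) << 2) | ((uint32_t)(IN1) << 10) | \
     ((uint32_t)(OUT) << 18) | ((uint32_t)(IMG_2D) << 26))

enum { DT_F16 = 1, DT_I32 = 2, DT_U8 = 3 };   /* placeholder codes */

int main(void)
{
    uint32_t k3d = KEY(1, DT_F16, DT_I32, DT_F16, 0);
    uint32_t k2d = KEY(1, DT_F16, DT_I32, DT_F16, 1);
    /* The 2D flag alone separates the two variants, so both can live in
     * one lookup table and be matched with a plain equality test. */
    printf("3D key = 0x%08x, 2D key = 0x%08x\n", (unsigned)k3d, (unsigned)k2d);
    return 0;
}
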
diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c
index 3be3996..3dc67d2 100644
--- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -289,9 +288,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_size_array_t * input1_shape = NULL;
int32_t src0ZP = 0;
- float src0Scale = 0;
+ float src0Scale = 1;
int32_t dstZP = 0;
- float dstScale = 0;
+ float dstScale = 1;
uint32_t pack_key = 0;
@@ -307,10 +306,6 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
- dstZP = attr[2]->asymm.zero_point;
- dstScale = attr[2]->asymm.scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
@@ -322,9 +317,10 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
+ else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
- src0Scale = 1;
+ src0Scale = attr[0]->asymm.scale;
+ src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
@@ -337,11 +333,11 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
- dstScale = 1.0f/dstScale;
}
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
+ else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
- dstScale = 1;
+ dstScale = 1.0f / attr[2]->asymm.scale;
+ dstZP = attr[2]->asymm.zero_point;
}
input1_shape = attr[1]->shape;
@@ -404,7 +400,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
case _PACK_SELECT_KEY( I8, F16):
case _PACK_SELECT_KEY( I16, F16):
{
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
+ gpu_quantize_multiplier_16bit( (double)src0Scale * dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
@@ -420,7 +416,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
case _PACK_SELECT_KEY( F16, I16):
{
int32_t postShift0 = 0;
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)src0Scale * dstScale, &M0, &postShift0);
multAndoutZP1[0] = (uint32_t)(M0);
multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
@@ -489,9 +485,9 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_size_array_t * input1_shape = NULL;
int32_t src0ZP = 0;
- float src0Scale = 0;
+ float src0Scale = 1;
int32_t dstZP = 0;
- float dstScale = 0;
+ float dstScale = 1;
uint32_t pack_key = 0;
@@ -505,10 +501,6 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
- dstZP = attr[2]->asymm.zero_point;
- dstScale = attr[2]->asymm.scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
@@ -520,9 +512,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
+ else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
- src0Scale = 1;
+ src0Scale = attr[0]->asymm.scale;
+ src0ZP = attr[0]->asymm.zero_point;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
@@ -535,11 +528,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
- dstScale = 1.0f/dstScale;
}
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
+ else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
- dstScale = 1;
+ dstScale = 1.0f / attr[2]->asymm.scale;
+ dstZP = attr[2]->asymm.zero_point;
}
input1_shape = attr[1]->shape;
@@ -609,7 +602,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
case _PACK_SELECT_KEY( I8, F16):
case _PACK_SELECT_KEY( I16, F16):
{
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
+ gpu_quantize_multiplier_16bit( (double)src0Scale * dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
@@ -625,7 +618,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
case _PACK_SELECT_KEY( F16, I16):
{
int32_t postShift0 = 0;
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)src0Scale * dstScale, &M0, &postShift0);
multAndoutZP1[0] = (uint32_t)(M0);
multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
@@ -840,4 +833,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( gather, _setup )
-
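
In the gather initializers above, dstScale now already holds the reciprocal of the output scale for ASYMM tensors, so the multiplier is formed with src0Scale * dstScale instead of the old division. The ratio handed to the fixed-point conversion is unchanged, as the small check below illustrates on sample scales.

/* Sketch: old (divide) and new (pre-inverted) forms give the same ratio. */
#include <assert.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    float src_scale = 0.5f;          /* example input quantization scale  */
    float dst_scale_raw = 0.25f;     /* example output quantization scale */

    double old_ratio = (double)src_scale / dst_scale_raw;      /* old code       */
    float  dst_scale = 1.0f / dst_scale_raw;                   /* new ASYMM path  */
    double new_ratio = (double)src_scale * dst_scale;          /* new code       */

    assert(fabs(old_ratio - new_ratio) < 1e-9);
    printf("ratio = %f\n", new_ratio);  /* 2.0 either way */
    return 0;
}
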
diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
index 78e9efe..0692c07 100644
--- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -213,11 +212,10 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
int32_t block_size = 0;
int32_t indices_num = 1;
-
int32_t src0ZP = 0;
- float src0Scale = 0;
+ float src0Scale = 1;
int32_t dstZP = 0;
- float dstScale = 0;
+ float dstScale = 1;
uint32_t pack_key = 0;
@@ -231,10 +229,6 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
- dstZP = attr[2]->asymm.zero_point;
- dstScale = attr[2]->asymm.scale;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
@@ -246,9 +240,10 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
- else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
+ else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
- src0Scale = 1;
+ src0Scale = attr[0]->asymm.scale;
+ src0ZP = attr[0]->asymm.zero_point;
}
if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
@@ -261,11 +256,11 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
- dstScale = 1.0f / dstScale;
}
- else if( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
+ else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
- dstScale = 1;
+ dstScale = 1.0f / attr[2]->asymm.scale;
+ dstZP = attr[2]->asymm.zero_point;
}
indices_num = (int32_t)(attr[1]->shape->data[1]);
@@ -319,7 +314,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
case _PACK_SELECT_KEY( I8, F16 ):
case _PACK_SELECT_KEY( I16, F16 ):
{
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
+ gpu_quantize_multiplier_16bit( (double)src0Scale * dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
@@ -335,7 +330,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
case _PACK_SELECT_KEY( F16, I16 ):
{
int32_t postShift0 = 0;
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)src0Scale * dstScale, &M0, &postShift0);
multAndoutZP1[0] = (uint32_t)(M0);
multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
@@ -502,4 +497,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( gather_nd, _setup )
-
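
gather_nd feeds that combined ratio into gpu_quantize_multiplier_16bit, which expresses it as a 16-bit integer multiplier plus a right shift. The routine below is a simplified stand-in written for illustration, not the library implementation; it only shows the M * 2^-postShift approximation the shader-side math relies on.

/* Sketch: approximate a positive ratio as M * 2^-post_shift with M in 16 bits.
 * Illustrative stand-in, NOT the real gpu_quantize_multiplier_16bit. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

static void quantize_multiplier_16bit_sketch(double ratio, uint16_t *M, int32_t *post_shift)
{
    int32_t shift = 0;
    double m = ratio;
    uint32_t q;

    if (m <= 0.0) { *M = 0; *post_shift = 0; return; }
    while (m < 32768.0 && shift < 31) { m *= 2.0; shift++; }    /* use the full 16 bits */
    while (m >= 65536.0 && shift > -31) { m /= 2.0; shift--; }  /* but stay inside them */

    q = (uint32_t)(m + 0.5);
    if (q == 65536u) { q >>= 1; shift--; }                      /* rounding overflow guard */
    *M = (uint16_t)q;
    *post_shift = shift;
}

int main(void)
{
    uint16_t M = 0;
    int32_t post_shift = 0;
    quantize_multiplier_16bit_sketch(2.0, &M, &post_shift);     /* e.g. src0Scale * dstScale */
    printf("2.0 ~= %u * 2^-%d = %f\n", (unsigned)M, (int)post_shift,
           ldexp((double)M, -post_shift));
    return 0;
}
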
diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c
index c7326d4..34c51f8 100644
--- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c
@@ -218,46 +218,6 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
- gpu_dp_inst_t uniIntegerSquareLo_4x4 = {{
- 0x01010101, // TCfg
- 0x00000000, // ASelt
- 0x00010000, 0x00030002, // ABin
- 0x00000000, // BSelt
- 0x00010000, 0x00030002, // BBin
- 0x00000400, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16};
- gpu_dp_inst_t uniIntegerSquareHi_4x4 = {{
- 0x01010101, // TCfg
- 0x00000000, // ASelt
- 0x00050004, 0x00070006, // ABin
- 0x00000000, // BSelt
- 0x00050004, 0x00070006, // BBin
- 0x00000400, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16};
- gpu_dp_inst_t uniDataSquareAddU32Lo_4x4 = {{
- 0x0d0d0d0d, // TCfg
- 0x04040404, // ASelt
- 0x00110000, 0x00330022, // ABin
- 0x00000000, // BSelt
- 0x00010000, 0x00030002, // BBin
- 0x00005400, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16};
- gpu_dp_inst_t uniDataSquareAddU32Hi_4x4 = {{
- 0x0d0d0d0d, // TCfg
- 0x04040404, // ASelt
- 0x00150004, 0x00370026, // ABin
- 0x00000000, // BSelt
- 0x00050004, 0x00070006, // BBin
- 0x00005400, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16};
gpu_dp_inst_t uniUInt8SquareLo_4x4 = {{
0x69696969, // TCfg
0x40404040, // ASelt
@@ -307,19 +267,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
status |= vsi_nn_kernel_gpu_add_param( node, "UniFp16MulLo_dp4x4", &UniFp16MulLo_dp4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "UniFp16MulHi_dp4x4", &UniFp16MulHi_dp4x4);
}
- else if(I8 == input_dtype)
- {
- status |= vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale);
- status |= vsi_nn_kernel_gpu_add_param( node, "uniDataSquareAddU32Lo_4x4", &uniDataSquareAddU32Lo_4x4);
- status |= vsi_nn_kernel_gpu_add_param( node, "uniDataSquareAddU32Hi_4x4", &uniDataSquareAddU32Hi_4x4);
- }
- else if(I16 == input_dtype)
- {
- status |= vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale);
- status |= vsi_nn_kernel_gpu_add_param( node, "uniIntegerSquareLo_4x4", &uniIntegerSquareLo_4x4);
- status |= vsi_nn_kernel_gpu_add_param( node, "uniIntegerSquareHi_4x4", &uniIntegerSquareHi_4x4);
- }
- else if(U8 == input_dtype)
+ else
{
status |= vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale);
status |= vsi_nn_kernel_gpu_add_param( node, "uniUInt8SquareLo_4x4", &uniUInt8SquareLo_4x4);
@@ -337,12 +285,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
vsi_nn_kernel_gpu_add_param( node, "inputWidthRemain256", &inputWidthRemain256);
vsi_nn_kernel_gpu_add_param( node, "inputWidthCount", &inputWidthCount);
vsi_nn_kernel_gpu_add_param( node, "uniSumSqrt_16x1", &uniSumSqrt_16x1);
- if (I16 == input_dtype || I8 == input_dtype)
- {
- status = vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale);
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- else if(U8 == input_dtype)
+ if (U8 == input_dtype || I16 == input_dtype || I8 == input_dtype)
{
float zP2x = 2 * (float)inputZP;
float zpSqrt16x = 16 * (float)inputZP * (float)inputZP;
@@ -477,14 +420,14 @@ static vsi_status _query_kernel
key = HASH_L2NORMALIZESCALE_HASH_KEY(axis, in0_dtype, in1_dtype, out_dtype, image_2d);
- for( i = 0; i < kernel_map_size; i ++ )
+ for ( i = 0; i < kernel_map_size; i ++ )
{
- if( kernel_map[i].key == key )
+ if ( kernel_map[i].key == key )
{
break;
}
}
- if( i < kernel_map_size )
+ if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@@ -521,7 +464,7 @@ static vsi_nn_kernel_node_t _setup
axis = vsi_nn_kernel_param_get_int32(params, "axis");
- if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
+ if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num )
@@ -532,10 +475,10 @@ static vsi_nn_kernel_node_t _setup
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, axis, image_2d );
- if( VSI_SUCCESS == status)
+ if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
- if( node )
+ if ( node )
{
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
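
The l2normalizescale change folds the I8 and I16 branches into the U8 path and uploads zP2x = 2*zp and zpSqrt16x = 16*zp*zp, presumably so the shader can expand the zero-point-corrected sum of squares instead of subtracting zp per element. The identity behind that, sum((x-zp)^2) = sum(x^2) - 2*zp*sum(x) + 16*zp^2 over a 16-lane block, is checked numerically below; the shader source itself is not part of this diff.

/* Sketch: the zero-point expansion the uploaded constants suggest, verified
 * on random data. The real DP-instruction shader path is not shown here. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int32_t zp = 7;                       /* example input zero point */
    int32_t x[16];
    int64_t lhs = 0, sum = 0, sum_sq = 0;
    int64_t rhs;
    int i;

    for (i = 0; i < 16; ++i)
    {
        x[i] = rand() % 256;              /* pretend U8 input */
        lhs    += (int64_t)(x[i] - zp) * (x[i] - zp);
        sum    += x[i];
        sum_sq += (int64_t)x[i] * x[i];
    }
    rhs = sum_sq - (int64_t)(2 * zp) * sum + 16LL * zp * zp;
    assert(lhs == rhs);
    printf("sum((x-zp)^2) = %lld\n", (long long)lhs);
    return 0;
}
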
diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c
index 5bade3b..317e8a0 100644
--- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c
@@ -38,65 +38,33 @@
__BEGIN_DECLS
-#define KERNEL_NAME_MAXIMUM_F16F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toF16")
-#define KERNEL_NAME_MAXIMUM_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toF16_2D")
-#define KERNEL_NAME_MAXIMUM_I8I8TOI8 CVIVANTE_NAMESPACE("evis.maximum_I8I8toI8")
-#define KERNEL_NAME_MAXIMUM_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_I8I8toI8_2D")
-#define KERNEL_NAME_MAXIMUM_I8F16TOI8 CVIVANTE_NAMESPACE("evis.maximum_I8F16toI8")
-#define KERNEL_NAME_MAXIMUM_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_I8F16toI8_2D")
-#define KERNEL_NAME_MAXIMUM_I8F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_I8F16toF16")
-#define KERNEL_NAME_MAXIMUM_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_I8F16toF16_2D")
-#define KERNEL_NAME_MAXIMUM_U8F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_U8F16toF16")
-#define KERNEL_NAME_MAXIMUM_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_U8F16toF16_2D")
-#define KERNEL_NAME_MAXIMUM_U8F16TOU8 CVIVANTE_NAMESPACE("evis.maximum_U8F16toU8")
-#define KERNEL_NAME_MAXIMUM_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_U8F16toU8_2D")
-#define KERNEL_NAME_MAXIMUM_U8U8TOU8 CVIVANTE_NAMESPACE("evis.maximum_U8U8toU8")
-#define KERNEL_NAME_MAXIMUM_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_U8U8toU8_2D")
-#define KERNEL_NAME_MAXIMUM_I16I16TOI16 CVIVANTE_NAMESPACE("evis.maximum_I16I16toI16")
-#define KERNEL_NAME_MAXIMUM_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_I16I16toI16_2D")
-#define KERNEL_NAME_MAXIMUM_I16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_I16F16toI16")
-#define KERNEL_NAME_MAXIMUM_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_I16F16toI16_2D")
-#define KERNEL_NAME_MAXIMUM_I16F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_I16F16toF16")
-#define KERNEL_NAME_MAXIMUM_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_I16F16toF16_2D")
-#define KERNEL_NAME_MAXIMUM_F16F16TOU8 CVIVANTE_NAMESPACE("evis.maximum_F16F16toU8")
-#define KERNEL_NAME_MAXIMUM_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toU8_2D")
-#define KERNEL_NAME_MAXIMUM_F16F16TOI8 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8")
-#define KERNEL_NAME_MAXIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8_2D")
-#define KERNEL_NAME_MAXIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16")
-#define KERNEL_NAME_MAXIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16_2D")
-#define KERNEL_NAME_MAXIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8")
-#define KERNEL_NAME_MAXIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8_2D")
-#define KERNEL_NAME_MAXIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16")
-#define KERNEL_NAME_MAXIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16_2D")
-
-#define KERNEL_SOURCE_1 "maximum",
-#define KERNEL_SOURCE_2 "maximum_fp16",
-#define KERNEL_SOURCE_3 "maximum_i16"
+#define KERNEL_SOURCE_0 "maximum_0",
+#define KERNEL_SOURCE_1 "maximum_1",
#define HASH_MAXIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
+#define HASH_MAXIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE)
+
+#define HASH_MAXIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D")
+
#define TENSOR_MAX_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
- KERNEL_NAME_MAXIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \
+ HASH_MAXIMUM_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_MAX_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
- KERNEL_NAME_MAXIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \
+ HASH_MAXIMUM_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE },
-#define HASH_MAXIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
- CVIVANTE_NAMESPACE("evis.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE)
-
#define TENSOR_MAX_KERNELS_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
HASH_MAXIMUM_SH_KERNEL_NAME(F16, F16, F16), \
SOURCE },
-#define HASH_MAXIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
- CVIVANTE_NAMESPACE("evis.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D")
-
#define TENSOR_MAX_KERNELS_2D_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
HASH_MAXIMUM_SH_KERNEL_2D_NAME(F16, F16, F16), \
@@ -108,43 +76,47 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
- TENSOR_MAX_KERNELS_HALF(F16, F16, F16, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS(F16, F16, I8, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS(I8, I8, I8, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS(U8, U8, I16, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS(I16, I16, I16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_HALF(F16, F16, F16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_HALF(BF16, BF16, BF16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(I8, I8, I8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(U8, U8, I16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(I16, I16, I16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(I16, I16, U8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(F16, F16, I8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(F16, F16, U8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS(F16, F16, I16, KERNEL_SOURCE_0)
- TENSOR_MAX_KERNELS(F16, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS(I8, F16, I8, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS(I8, F16, F16, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS(U8, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS(U8, F16, F16, KERNEL_SOURCE_2)
+ TENSOR_MAX_KERNELS(I8, I8, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(I8, F16, I8, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(I8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(U8, U8, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(U8, F16, U8, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(I16, I16, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(U8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(I16, F16, I16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS(I16, F16, F16, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS(I16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MAX_KERNELS(I16, F16, F16, KERNEL_SOURCE_3)
- TENSOR_MAX_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MAX_KERNELS(I16, I16, U8, KERNEL_SOURCE_3)
+ TENSOR_MAX_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_0)
+ TENSOR_MAX_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_0)
- TENSOR_MAX_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1)
- TENSOR_MAX_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1)
-
- TENSOR_MAX_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MAX_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_2)
-
- TENSOR_MAX_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MAX_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3)
- TENSOR_MAX_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MAX_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3)
+ TENSOR_MAX_KERNELS_2D(I8, I8, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(I16, I16, F16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_1)
+ TENSOR_MAX_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_1)
};
static vx_param_description_t kernel_param_def[] =
@@ -170,19 +142,12 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
{0, 0, 0},
{0, 0, 0}
};
- uint8_t in0_fl = 0;
- int32_t src0ZP = 0;
- float src0Scale = 1.0f;
- uint8_t in1_fl = 0;
- int32_t src1ZP = 0;
- float src1Scale = 1.0f;
- uint8_t out_fl = 0;
- int32_t dstZP = 0;
- float dstScale = 1.0f;
- float output_zp = 0.0f;
-
- int32_t shift0 = 0;
- int32_t shift1 = 0;
+ int32_t input0_zp = 0;
+ float input0_scale = 1.0f;
+ int32_t input1_zp = 0;
+ float input1_scale = 1.0f;
+ int32_t output_zp = 0;
+ float output_scale = 1.0f;
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
@@ -199,65 +164,60 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
- in0_fl = (uint8_t)attr[0]->dfp.fl;
- if (in0_fl > 0)
+ int32_t fl = attr[0]->dfp.fl;
+ if (fl > 0)
{
- src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl);
+ input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
- src0Scale = (float)((int64_t)1 << -in0_fl);
+ input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
+ input0_zp = attr[0]->asymm.zero_point;
+ input0_scale = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
- in1_fl = (uint8_t)attr[1]->dfp.fl;
- if (in1_fl > 0)
+ int32_t fl = attr[1]->dfp.fl;
+ if (fl > 0)
{
- src0Scale = 1.0f / (float) ((int64_t)1 << in1_fl);
+ input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
- src0Scale = (float)((int64_t)1 << -in1_fl);
+ input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
- src1ZP = attr[1]->asymm.zero_point;
- src1Scale = attr[1]->asymm.scale;
+ input1_zp = attr[1]->asymm.zero_point;
+ input1_scale = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
- out_fl = (uint8_t)attr[2]->dfp.fl;
- if (out_fl > 0)
+ int32_t fl = (uint8_t)attr[2]->dfp.fl;
+ if (fl > 0)
{
- dstScale = (float) ((int64_t)1 << out_fl);
+ output_scale = (float) ((int64_t)1 << fl);
}
else
{
- dstScale = 1.0f / (float)((int64_t)1 << -out_fl);
+ output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
- dstZP = 0;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
- dstZP = attr[2]->asymm.zero_point;
- dstScale = attr[2]->asymm.scale;
+ output_zp = attr[2]->asymm.zero_point;
+ output_scale = 1.0f / attr[2]->asymm.scale;
}
- output_zp = (float)dstZP;
-
- shift0 = in0_fl - out_fl;
- shift1 = in1_fl - out_fl;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@@ -265,17 +225,16 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
pack_key = _PACK_SELECT_KEY( attr[0]->dtype,
attr[1]->dtype, attr[2]->dtype );
- if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
- || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16))
- || (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) )
+ if ( ( attr[0]->dtype == I8 && attr[1]->dtype == I8 && attr[2]->dtype == I8 ) ||
+ ( attr[0]->dtype == U8 && attr[1]->dtype == U8 && attr[2]->dtype == U8 ) )
{
- gpu_param.global_scale[0] = 8;
+ gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
else
{
- gpu_param.global_scale[0] = 16;
+ gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
@@ -290,127 +249,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
switch( pack_key )
{
- case _PACK_SELECT_KEY( I8, I8, I8 ):
- case _PACK_SELECT_KEY( I8, F16, I8 ):
- {
- gpu_dp_inst_t uniConvertI8toI8_0_part0_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI8toI8_0_part1_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x0b0a0908, 0x0f0e0d0c, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI8toI8_1_part0_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI8toI8_1_part1_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x0b0a0908, 0x0f0e0d0c, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part0_2x8, shift0 );
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part1_2x8, shift0 );
-
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_0_part0_2x8", &uniConvertI8toI8_0_part0_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_0_part1_2x8", &uniConvertI8toI8_0_part1_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
-
- if ( attr[1]->dtype == F16 )
- {
- gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- else
- {
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part0_2x8, shift1 );
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part1_2x8, shift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_1_part0_2x8", &uniConvertI8toI8_1_part0_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_1_part1_2x8", &uniConvertI8toI8_1_part1_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- }
- break;
- case _PACK_SELECT_KEY( I16, I16, I16 ):
- {
- gpu_dp_inst_t uniConvertI16toI16_0_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI16toI16_1_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertI16toI16_0_2x8, shift0 );
- gpu_dp_inst_update_postshfit( &uniConvertI16toI16_1_2x8, shift1 );
-
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI16toI16_0_2x8", &uniConvertI16toI16_0_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI16toI16_1_2x8", &uniConvertI16toI16_1_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( U8, U8, U8 ):
- case _PACK_SELECT_KEY( U8, F16, U8 ):
- case _PACK_SELECT_KEY( F16, F16, U8 ):
- case _PACK_SELECT_KEY( U8, U8, I16 ):
- case _PACK_SELECT_KEY( I16, I16, U8 ):
+ case _PACK_SELECT_KEY( I8, I8, I8 ):
+ case _PACK_SELECT_KEY( U8, U8, U8 ):
{
uint16_t M0 = 0;
uint16_t M1 = 0;
@@ -440,91 +280,57 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
- gpu_quantize_multiplier_16bit( (double)src1Scale / dstScale, &M1, &postShift1);
+ gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1);
multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
+ multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0);
multAndoutZP1[0] = (uint32_t)(M1);
- multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src1ZP * M1);
+ multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 );
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 );
- status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
+ status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
- if (attr[0]->dtype == U8 || attr[0]->dtype == I16)
- {
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
- if (attr[0]->dtype != I16)
- {
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
- }
- status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
-
- if ( attr[1]->dtype == F16 )
- {
- gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{
- 0xdddddddd, // TCfg
- 0x44444444, // ASelt
- 0x13121110, 0x17161514, // ABin
- 0x11111111, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00002600, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- else
- {
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
- if (attr[0]->dtype != I16)
- {
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
- }
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- }
- break;
- case _PACK_SELECT_KEY( I8, F16, F16 ):
- {
- gpu_dp_inst_t uniConvertInt8toFp16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertInt8toFp16_2x8, shift0 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertInt8toFp16_2x8", &uniConvertInt8toFp16_2x8 );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
- case _PACK_SELECT_KEY( U8, F16, F16 ):
+ case _PACK_SELECT_KEY( I8, I8, F16 ):
+ case _PACK_SELECT_KEY( I8, F16, I8 ):
+ case _PACK_SELECT_KEY( I8, F16, F16 ):
+ case _PACK_SELECT_KEY( F16, F16, I8 ):
+ case _PACK_SELECT_KEY( U8, U8, I16 ):
+ case _PACK_SELECT_KEY( U8, U8, F16 ):
+ case _PACK_SELECT_KEY( U8, F16, F16 ):
+ case _PACK_SELECT_KEY( U8, F16, U8 ):
+ case _PACK_SELECT_KEY( F16, F16, U8 ):
+ case _PACK_SELECT_KEY( I16, I16, F16 ):
+ case _PACK_SELECT_KEY( I16, F16, F16 ):
+ case _PACK_SELECT_KEY( I16, I16, U8 ):
+ case _PACK_SELECT_KEY( I16, I16, I16 ):
+ case _PACK_SELECT_KEY( I16, F16, I16 ):
+ case _PACK_SELECT_KEY( F16, F16, I16 ):
{
uint16_t M0 = 0;
- int32_t postShift = 0;
+ uint16_t M1 = 0;
+ int32_t postShift0 = 0;
+ int32_t postShift1 = 0;
uint32_t multAndoutZP0[2] = {0};
- gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
+ uint32_t multAndoutZP1[2] = {0};
+ gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
@@ -535,134 +341,25 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
+ gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1);
+
multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
+ multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0);
+ multAndoutZP1[0] = (uint32_t)(M1);
+ multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1);
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 );
status = vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
+ "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( I16, F16, I16 ):
- {
- gpu_dp_inst_t uniConvertI16toI16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uinConvertFp16ToInt16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertI16toI16_2x8, shift0 );
- gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt16_2x8, shift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI16toI16_2x8", &uniConvertI16toI16_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uinConvertFp16ToInt16_2x8", &uinConvertFp16ToInt16_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( F16, F16, I16 ):
- {
- gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
- 0x33333333, // TCfg
- 0x11110000, // ASelt
- 0x03020100, 0x03020100, // ABin
- 0x00000000, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00002400, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvert1stFp16ToFp32_4x4 = {{
- 0x01010101, // TCfg
- 0x00000000, // ASelt
- 0x00010000, 0x00030002, // ABin
- 0x02020202, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvert2ndFp16ToFp32_4x4 = {{
- 0x01010101, // TCfg
- 0x00000000, // ASelt
- 0x00050004, 0x00070006, // ABin
- 0x02020202, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
-
- if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
- || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
+ if ( attr[1]->dtype != F16 || attr[2]->dtype != F16)
{
- dstScale = 1.0f / dstScale;
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
}
-
- status = vsi_nn_kernel_gpu_add_param( node,
- "outputScale", &dstScale );
- status = vsi_nn_kernel_gpu_add_param( node,
- "output_zp", &output_zp );
- status |= vsi_nn_kernel_gpu_add_param(node,
- "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
- status |= vsi_nn_kernel_gpu_add_param(node,
- "uniConvert1stFp16ToFp32_4x4", &uniConvert1stFp16ToFp32_4x4);
- status |= vsi_nn_kernel_gpu_add_param(node,
- "uniConvert2ndFp16ToFp32_4x4", &uniConvert2ndFp16ToFp32_4x4);
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( I16, F16, F16 ):
- {
- gpu_dp_inst_t uniConvertInt16toFp16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertInt16toFp16_2x8, shift0 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertInt16toFp16_2x8", &uniConvertInt16toFp16_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( F16, F16, I8 ):
- {
- gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift0 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
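
The maximum rewrite collapses the per-type DP tables into two generic requant paths built on multAndoutZP, whose second element pre-combines the output zero point with the input zero-point correction: bias = (output_zp << postShift) - input_zp * M. The scalar sketch below reproduces that arithmetic and compares it against the float reference; it illustrates the formula only, not the EVIS DP-instruction path.

/* Sketch: scalar view of the multAndoutZP requantization used above.
 *   y = ((x - in_zp) * in_scale) / out_scale + out_zp
 * with in_scale/out_scale approximated as M * 2^-post_shift. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    float in_scale = 0.05f, out_scale = 0.1f;
    int32_t in_zp = 128, out_zp = 10;

    /* Assume M/post_shift came from gpu_quantize_multiplier_16bit(in_scale / out_scale). */
    int32_t post_shift = 15;
    uint16_t M = (uint16_t)lrintf((in_scale / out_scale) * (float)(1 << post_shift)); /* 16384 */

    /* multAndoutZP[1] in the diff: output zp and input zp folded into one bias. */
    int32_t bias = (out_zp << post_shift) - in_zp * (int32_t)M;

    int32_t x = 200;                                      /* sample quantized input */
    int32_t y = (x * (int32_t)M + bias) >> post_shift;    /* shader-style path      */
    float   y_ref = (x - in_zp) * in_scale / out_scale + out_zp;

    printf("fixed-point y = %d, float reference = %.2f\n", y, y_ref);  /* both 46 */
    return 0;
}
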
diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c
index 9a64243..30dfc93 100644
--- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c
@@ -38,65 +38,33 @@
__BEGIN_DECLS
-#define KERNEL_NAME_MINIMUM_F16F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toF16")
-#define KERNEL_NAME_MINIMUM_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toF16_2D")
-#define KERNEL_NAME_MINIMUM_I8I8TOI8 CVIVANTE_NAMESPACE("evis.minimum_I8I8toI8")
-#define KERNEL_NAME_MINIMUM_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_I8I8toI8_2D")
-#define KERNEL_NAME_MINIMUM_I8F16TOI8 CVIVANTE_NAMESPACE("evis.minimum_I8F16toI8")
-#define KERNEL_NAME_MINIMUM_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_I8F16toI8_2D")
-#define KERNEL_NAME_MINIMUM_I8F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_I8F16toF16")
-#define KERNEL_NAME_MINIMUM_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_I8F16toF16_2D")
-#define KERNEL_NAME_MINIMUM_U8F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_U8F16toF16")
-#define KERNEL_NAME_MINIMUM_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_U8F16toF16_2D")
-#define KERNEL_NAME_MINIMUM_U8F16TOU8 CVIVANTE_NAMESPACE("evis.minimum_U8F16toU8")
-#define KERNEL_NAME_MINIMUM_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_U8F16toU8_2D")
-#define KERNEL_NAME_MINIMUM_U8U8TOU8 CVIVANTE_NAMESPACE("evis.minimum_U8U8toU8")
-#define KERNEL_NAME_MINIMUM_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_U8U8toU8_2D")
-#define KERNEL_NAME_MINIMUM_I16I16TOI16 CVIVANTE_NAMESPACE("evis.minimum_I16I16toI16")
-#define KERNEL_NAME_MINIMUM_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_I16I16toI16_2D")
-#define KERNEL_NAME_MINIMUM_I16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_I16F16toI16")
-#define KERNEL_NAME_MINIMUM_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_I16F16toI16_2D")
-#define KERNEL_NAME_MINIMUM_I16F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_I16F16toF16")
-#define KERNEL_NAME_MINIMUM_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_I16F16toF16_2D")
-#define KERNEL_NAME_MINIMUM_F16F16TOU8 CVIVANTE_NAMESPACE("evis.minimum_F16F16toU8")
-#define KERNEL_NAME_MINIMUM_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toU8_2D")
-#define KERNEL_NAME_MINIMUM_F16F16TOI8 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8")
-#define KERNEL_NAME_MINIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8_2D")
-#define KERNEL_NAME_MINIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16")
-#define KERNEL_NAME_MINIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16_2D")
-#define KERNEL_NAME_MINIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8")
-#define KERNEL_NAME_MINIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8_2D")
-#define KERNEL_NAME_MINIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16")
-#define KERNEL_NAME_MINIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16_2D")
-
-#define KERNEL_SOURCE_1 "minimum",
-#define KERNEL_SOURCE_2 "minimum_fp16",
-#define KERNEL_SOURCE_3 "minimum_i16"
+#define KERNEL_SOURCE_0 "minimum_0",
+#define KERNEL_SOURCE_1 "minimum_1",
#define HASH_MINIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
+#define HASH_MINIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE)
+
+#define HASH_MINIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D")
+
#define TENSOR_MIN_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
- KERNEL_NAME_MINIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \
+ HASH_MINIMUM_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_MIN_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
- KERNEL_NAME_MINIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \
+ HASH_MINIMUM_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE },
-#define HASH_MINIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
- CVIVANTE_NAMESPACE("evis.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE)
-
#define TENSOR_MIN_KERNELS_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
HASH_MINIMUM_SH_KERNEL_NAME(F16, F16, F16), \
SOURCE },
-#define HASH_MINIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
- CVIVANTE_NAMESPACE("evis.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D")
-
#define TENSOR_MIN_KERNELS_2D_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
{ HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
HASH_MINIMUM_SH_KERNEL_2D_NAME(F16, F16, F16), \
@@ -108,43 +76,47 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
- TENSOR_MIN_KERNELS_HALF(F16, F16, F16, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS(F16, F16, I8, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS(I8, I8, I8, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS(U8, U8, I16, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS(I16, I16, I16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_HALF(F16, F16, F16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_HALF(BF16, BF16, BF16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(I8, I8, I8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(U8, U8, I16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(I16, I16, I16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(I16, I16, U8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(F16, F16, I8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(F16, F16, U8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS(F16, F16, I16, KERNEL_SOURCE_0)
- TENSOR_MIN_KERNELS(F16, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS(I8, F16, I8, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS(I8, F16, F16, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS(U8, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS(U8, F16, F16, KERNEL_SOURCE_2)
+ TENSOR_MIN_KERNELS(I8, I8, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(I8, F16, I8, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(I8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(U8, U8, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(U8, F16, U8, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(I16, I16, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(U8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(I16, F16, I16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS(I16, F16, F16, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS(I16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MIN_KERNELS(I16, F16, F16, KERNEL_SOURCE_3)
- TENSOR_MIN_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MIN_KERNELS(I16, I16, U8, KERNEL_SOURCE_3)
+ TENSOR_MIN_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_0)
+ TENSOR_MIN_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_0)
- TENSOR_MIN_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1)
- TENSOR_MIN_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1)
-
- TENSOR_MIN_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_2)
- TENSOR_MIN_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_2)
-
- TENSOR_MIN_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MIN_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3)
- TENSOR_MIN_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
- TENSOR_MIN_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3)
+ TENSOR_MIN_KERNELS_2D(I8, I8, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(I16, I16, F16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_1)
+ TENSOR_MIN_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_1)
};
static vx_param_description_t kernel_param_def[] =
@@ -170,19 +142,12 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
{0, 0, 0},
{0, 0, 0}
};
- uint8_t in0_fl = 0;
- int32_t src0ZP = 0;
- float src0Scale = 1.0f;
- uint8_t in1_fl = 0;
- int32_t src1ZP = 0;
- float src1Scale = 1.0f;
- uint8_t out_fl = 0;
- int32_t dstZP = 0;
- float dstScale = 1.0f;
- float output_zp = 0.0f;
-
- int32_t shift0 = 0;
- int32_t shift1 = 0;
+ int32_t input0_zp = 0;
+ float input0_scale = 1.0f;
+ int32_t input1_zp = 0;
+ float input1_scale = 1.0f;
+ int32_t output_zp = 0;
+ float output_scale = 1.0f;
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
@@ -199,65 +164,60 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
- in0_fl = (uint8_t)attr[0]->dfp.fl;
- if (in0_fl > 0)
+ int32_t fl = attr[0]->dfp.fl;
+ if (fl > 0)
{
- src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl);
+ input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
- src0Scale = (float)((int64_t)1 << -in0_fl);
+ input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
+ input0_zp = attr[0]->asymm.zero_point;
+ input0_scale = attr[0]->asymm.scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
- in1_fl = (uint8_t)attr[1]->dfp.fl;
- if (in1_fl > 0)
+ int32_t fl = attr[1]->dfp.fl;
+ if (fl > 0)
{
- src0Scale = 1.0f / (float) ((int64_t)1 << in1_fl);
+ input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
- src0Scale = (float)((int64_t)1 << -in1_fl);
+ input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
- src1ZP = attr[1]->asymm.zero_point;
- src1Scale = attr[1]->asymm.scale;
+ input1_zp = attr[1]->asymm.zero_point;
+ input1_scale = attr[1]->asymm.scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
- out_fl = (uint8_t)attr[2]->dfp.fl;
- if (out_fl > 0)
+        int32_t fl = attr[2]->dfp.fl;
+ if (fl > 0)
{
- dstScale = (float) ((int64_t)1 << out_fl);
+ output_scale = (float) ((int64_t)1 << fl);
}
else
{
- dstScale = 1.0f / (float)((int64_t)1 << -out_fl);
+ output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
- dstZP = 0;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
- dstZP = attr[2]->asymm.zero_point;
- dstScale = attr[2]->asymm.scale;
+ output_zp = attr[2]->asymm.zero_point;
+ output_scale = 1.0f / attr[2]->asymm.scale;
}
- output_zp = (float)dstZP;
-
- shift0 = in0_fl - out_fl;
- shift1 = in1_fl - out_fl;
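
The rewritten initializer folds each tensor's dynamic fixed-point fractional length directly into a float scale instead of carrying per-tensor post-shifts. A minimal sketch of that conversion, under the same convention (real = quantized * 2^-fl):

#include <stdint.h>

/* Sketch of the DFP conversion used above: a positive fractional length fl
 * means real = quantized / 2^fl, a negative one means real = quantized * 2^-fl. */
static float dfp_to_scale(int32_t fl)
{
    return (fl > 0) ? 1.0f / (float)((int64_t)1 << fl)
                    : (float)((int64_t)1 << -fl);
}
/* e.g. fl = 7 gives a step of 1/128, fl = -2 gives a step of 4. */
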
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16))
@@ -265,17 +225,16 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
pack_key = _PACK_SELECT_KEY( attr[0]->dtype,
attr[1]->dtype, attr[2]->dtype );
- if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
- || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16))
- || (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) )
+ if ( ( attr[0]->dtype == I8 && attr[1]->dtype == I8 && attr[2]->dtype == I8 ) ||
+ ( attr[0]->dtype == U8 && attr[1]->dtype == U8 && attr[2]->dtype == U8 ) )
{
- gpu_param.global_scale[0] = 8;
+ gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
else
{
- gpu_param.global_scale[0] = 16;
+ gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
@@ -290,127 +249,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
switch( pack_key )
{
- case _PACK_SELECT_KEY( I8, I8, I8 ):
- case _PACK_SELECT_KEY( I8, F16, I8 ):
- {
- gpu_dp_inst_t uniConvertI8toI8_0_part0_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI8toI8_0_part1_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x0b0a0908, 0x0f0e0d0c, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI8toI8_1_part0_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI8toI8_1_part1_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x0b0a0908, 0x0f0e0d0c, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part0_2x8, shift0 );
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part1_2x8, shift0 );
-
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_0_part0_2x8", &uniConvertI8toI8_0_part0_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_0_part1_2x8", &uniConvertI8toI8_0_part1_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
-
- if ( attr[1]->dtype == F16 )
- {
- gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- else
- {
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part0_2x8, shift1 );
- gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part1_2x8, shift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_1_part0_2x8", &uniConvertI8toI8_1_part0_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI8toI8_1_part1_2x8", &uniConvertI8toI8_1_part1_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- }
- break;
- case _PACK_SELECT_KEY( I16, I16, I16 ):
- {
- gpu_dp_inst_t uniConvertI16toI16_0_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertI16toI16_1_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertI16toI16_0_2x8, shift0 );
- gpu_dp_inst_update_postshfit( &uniConvertI16toI16_1_2x8, shift1 );
-
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI16toI16_0_2x8", &uniConvertI16toI16_0_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI16toI16_1_2x8", &uniConvertI16toI16_1_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( U8, U8, U8 ):
- case _PACK_SELECT_KEY( U8, F16, U8 ):
- case _PACK_SELECT_KEY( F16, F16, U8 ):
- case _PACK_SELECT_KEY( U8, U8, I16 ):
- case _PACK_SELECT_KEY( I16, I16, U8 ):
+ case _PACK_SELECT_KEY( I8, I8, I8 ):
+ case _PACK_SELECT_KEY( U8, U8, U8 ):
{
uint16_t M0 = 0;
uint16_t M1 = 0;
@@ -440,91 +280,57 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
- gpu_quantize_multiplier_16bit( (double)src1Scale / dstScale, &M1, &postShift1);
+ gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1);
multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
+ multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0);
multAndoutZP1[0] = (uint32_t)(M1);
- multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src1ZP * M1);
+ multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 );
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 );
- status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
+ status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
- if (attr[0]->dtype == U8 || attr[0]->dtype == I16)
- {
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
- if (attr[0]->dtype != I16)
- {
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
- }
- status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
-
- if ( attr[1]->dtype == F16 )
- {
- gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{
- 0xdddddddd, // TCfg
- 0x44444444, // ASelt
- 0x13121110, 0x17161514, // ABin
- 0x11111111, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00002600, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- else
- {
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
- if (attr[0]->dtype != I16)
- {
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
- }
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- }
- break;
- case _PACK_SELECT_KEY( I8, F16, F16 ):
- {
- gpu_dp_inst_t uniConvertInt8toFp16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertInt8toFp16_2x8, shift0 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertInt8toFp16_2x8", &uniConvertInt8toFp16_2x8 );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
- case _PACK_SELECT_KEY( U8, F16, F16 ):
+ case _PACK_SELECT_KEY( I8, I8, F16 ):
+ case _PACK_SELECT_KEY( I8, F16, I8 ):
+ case _PACK_SELECT_KEY( I8, F16, F16 ):
+ case _PACK_SELECT_KEY( F16, F16, I8 ):
+ case _PACK_SELECT_KEY( U8, U8, I16 ):
+ case _PACK_SELECT_KEY( U8, U8, F16 ):
+ case _PACK_SELECT_KEY( U8, F16, F16 ):
+ case _PACK_SELECT_KEY( U8, F16, U8 ):
+ case _PACK_SELECT_KEY( F16, F16, U8 ):
+ case _PACK_SELECT_KEY( I16, I16, F16 ):
+ case _PACK_SELECT_KEY( I16, F16, F16 ):
+ case _PACK_SELECT_KEY( I16, I16, U8 ):
+ case _PACK_SELECT_KEY( I16, I16, I16 ):
+ case _PACK_SELECT_KEY( I16, F16, I16 ):
+ case _PACK_SELECT_KEY( F16, F16, I16 ):
{
uint16_t M0 = 0;
- int32_t postShift = 0;
+ uint16_t M1 = 0;
+ int32_t postShift0 = 0;
+ int32_t postShift1 = 0;
uint32_t multAndoutZP0[2] = {0};
- gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
+ uint32_t multAndoutZP1[2] = {0};
+ gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
@@ -535,134 +341,25 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
+ gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1);
+
multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
+ multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0);
+ multAndoutZP1[0] = (uint32_t)(M1);
+ multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1);
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 );
status = vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
+ "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( I16, F16, I16 ):
- {
- gpu_dp_inst_t uniConvertI16toI16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uinConvertFp16ToInt16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertI16toI16_2x8, shift0 );
- gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt16_2x8, shift1 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertI16toI16_2x8", &uniConvertI16toI16_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uinConvertFp16ToInt16_2x8", &uinConvertFp16ToInt16_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( F16, F16, I16 ):
- {
- gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
- 0x33333333, // TCfg
- 0x11110000, // ASelt
- 0x03020100, 0x03020100, // ABin
- 0x00000000, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00002400, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvert1stFp16ToFp32_4x4 = {{
- 0x01010101, // TCfg
- 0x00000000, // ASelt
- 0x00010000, 0x00030002, // ABin
- 0x02020202, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvert2ndFp16ToFp32_4x4 = {{
- 0x01010101, // TCfg
- 0x00000000, // ASelt
- 0x00050004, 0x00070006, // ABin
- 0x02020202, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
-
- if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
- || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM)
+ if ( attr[1]->dtype != F16 || attr[2]->dtype != F16)
{
- dstScale = 1.0f / dstScale;
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
}
-
- status = vsi_nn_kernel_gpu_add_param( node,
- "outputScale", &dstScale );
- status = vsi_nn_kernel_gpu_add_param( node,
- "output_zp", &output_zp );
- status |= vsi_nn_kernel_gpu_add_param(node,
- "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
- status |= vsi_nn_kernel_gpu_add_param(node,
- "uniConvert1stFp16ToFp32_4x4", &uniConvert1stFp16ToFp32_4x4);
- status |= vsi_nn_kernel_gpu_add_param(node,
- "uniConvert2ndFp16ToFp32_4x4", &uniConvert2ndFp16ToFp32_4x4);
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( I16, F16, F16 ):
- {
- gpu_dp_inst_t uniConvertInt16toFp16_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uniConvertInt16toFp16_2x8, shift0 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvertInt16toFp16_2x8", &uniConvertInt16toFp16_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, final );
- }
- break;
- case _PACK_SELECT_KEY( F16, F16, I8 ):
- {
- gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{
- 0x11111111, // TCfg
- 0x00000000, // ASelt
- 0x03020100, 0x07060504, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16 };
-
- gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift0 );
- status = vsi_nn_kernel_gpu_add_param( node,
- "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
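
Both remaining cases now share the same 16-bit fixed-point requantization: gpu_quantize_multiplier_16bit() turns input_scale * output_scale into a multiplier M plus a post-shift, and multAndoutZP packs the zero-point correction as (output_zp << postShift) - input_zp * M. A reference sketch of the arithmetic those packed constants stand for (the helper name is hypothetical, and the DP instruction's rounding and saturation are ignored):

#include <stdint.h>

/* Reference only:
 *   M / 2^postShift  ~  input_scale / output_quant_scale
 *   out = ((in * M) + (out_zp << postShift) - in_zp * M) >> postShift
 *       = (in - in_zp) * M / 2^postShift + out_zp
 */
static int32_t requantize_ref(int32_t in, int32_t in_zp, int32_t out_zp,
                              uint16_t M, int32_t post_shift)
{
    int64_t acc = (int64_t)in * M
                + (((int64_t)out_zp << post_shift) - (int64_t)in_zp * M);
    return (int32_t)(acc >> post_shift);
}
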
diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c
index bc45fc0..d791426 100644
--- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c
@@ -802,6 +802,13 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+ if ( ( input0_dtype == I8 || input0_dtype == I16 ) &&
+ ( inputs[0]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_DFP &&
+ inputs[0]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_NONE ) )
+ {
+ return VSI_FAILURE;
+ }
+
key = HASH_MOMENTS_KEY( input0_dtype, output_dtype, axis_num, axis[0], axis[1], axis[2], rs_flg );
for( i = 0; i < _cnt_of_array(moments_map); i++ )
diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c
index bc78fd3..5dc0502 100644
--- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c
@@ -22,7 +22,6 @@
*
*****************************************************************************/
-
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -68,24 +67,36 @@ typedef struct
static const _kernel_map_type _one_hot_kernel_map[] =
{
// Register kernel here
+ PACK_ONE_HOT_KERNEL_3D( U8, I8 ),
PACK_ONE_HOT_KERNEL_3D( U8, U8 ),
PACK_ONE_HOT_KERNEL_3D( U8, F16 ),
+ PACK_ONE_HOT_KERNEL_3D( U8, I16 ),
+ PACK_ONE_HOT_KERNEL_3D( U8, BF16 ),
PACK_ONE_HOT_KERNEL_3D( I8, I8 ),
PACK_ONE_HOT_KERNEL_3D( I8, F16 ),
+ PACK_ONE_HOT_KERNEL_3D( I16, I8 ),
+ PACK_ONE_HOT_KERNEL_3D( I16, U8 ),
PACK_ONE_HOT_KERNEL_3D( I16, I16 ),
PACK_ONE_HOT_KERNEL_3D( I16, F16 ),
+ PACK_ONE_HOT_KERNEL_3D( I16, BF16 ),
PACK_ONE_HOT_KERNEL_3D( F16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, I16 ),
PACK_ONE_HOT_KERNEL_3D( F16, U8 ),
PACK_ONE_HOT_KERNEL_3D( F16, I8 ),
PACK_ONE_HOT_KERNEL_3D( BF16, BF16 ),
+ PACK_ONE_HOT_KERNEL_2D( U8, I8 ),
PACK_ONE_HOT_KERNEL_2D( U8, U8 ),
PACK_ONE_HOT_KERNEL_2D( U8, F16 ),
+ PACK_ONE_HOT_KERNEL_2D( U8, I16 ),
+ PACK_ONE_HOT_KERNEL_2D( U8, BF16 ),
PACK_ONE_HOT_KERNEL_2D( I8, I8 ),
PACK_ONE_HOT_KERNEL_2D( I8, F16 ),
+ PACK_ONE_HOT_KERNEL_2D( I16, U8 ),
+    PACK_ONE_HOT_KERNEL_2D( I16, I8 ),
PACK_ONE_HOT_KERNEL_2D( I16, I16 ),
PACK_ONE_HOT_KERNEL_2D( I16, F16 ),
+ PACK_ONE_HOT_KERNEL_2D( I16, BF16 ),
PACK_ONE_HOT_KERNEL_2D( F16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, I16 ),
PACK_ONE_HOT_KERNEL_2D( F16, U8 ),
@@ -93,7 +104,6 @@ static const _kernel_map_type _one_hot_kernel_map[] =
PACK_ONE_HOT_KERNEL_2D( BF16, BF16 ),
};
-
/*
* Kernel params
*/
@@ -151,7 +161,7 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant)
{
- srcFixPointPos = attr[0]->dfp.fl;
+ srcFixPointPos = attr[0]->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant)
{
@@ -335,7 +345,6 @@ final:
return status;
} /* _one_hot_initializer() */
-
/*
* Query kernel
*/
@@ -361,6 +370,13 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+ if ( ( in_dtype == I8 || in_dtype == I16 ) &&
+ ( inputs[0]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_DFP &&
+ inputs[0]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_NONE ) )
+ {
+ return VSI_FAILURE;
+ }
+
key = ONE_HOT_HASH_KEY( in_dtype, out_dtype, image_2d );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
@@ -389,7 +405,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
-
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
@@ -504,4 +519,3 @@ final:
__END_DECLS
REGISTER_BACKEND_EVIS( one_hot, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c
new file mode 100644
index 0000000..4089e0c
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c
@@ -0,0 +1,689 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "libnnext/vx_lib_nnext.h"
+
+__BEGIN_DECLS
+
+#define KERNEL_SOURCE_0 "pre_process_rgb888_planar_0",
+#define KERNEL_SOURCE_1 "pre_process_rgb888_planar_1",
+#define KERNEL_SOURCE_2 "pre_process_rgb888_planar_2",
+
+#define STR(a) #a
+
+typedef enum
+{
+ COPY = 0,
+ SCALE,
+ FOUR_OVER_THREE,
+ HALF
+} _internal_scale_e;
+// Add kernel hashtable here
+#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE_FLAG ) \
+ (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | (SCALE_FLAG))
+
+#define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
+ KERNEL_SOURCE_0 }
+
+#define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, COPY ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
+ KERNEL_SOURCE_1 }
+
+#define PACK_KERNEL_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, FOUR_OVER_THREE ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
+ KERNEL_SOURCE_2 }
+
+#define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, HALF ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
+ KERNEL_SOURCE_2 }
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] =
+{
+ // Register kernel here
+ PACK_KERNEL_SCALE_MAP( U8, F16 ),
+ PACK_KERNEL_SCALE_MAP( U8, I16 ),
+ PACK_KERNEL_SCALE_MAP( U8, I8 ),
+ PACK_KERNEL_SCALE_MAP( U8, U8 ),
+
+ PACK_KERNEL_COPY_MAP( U8, F16 ),
+ PACK_KERNEL_COPY_MAP( U8, I16 ),
+ PACK_KERNEL_COPY_MAP( U8, I8 ),
+ PACK_KERNEL_COPY_MAP( U8, U8 ),
+
+ PACK_KERNEL_4_OVER_3_MAP( U8, U8 ),
+ PACK_KERNEL_HALF_MAP( U8, U8 ),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t shaderParam = {
+ 2, // workdim
+ {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
+ {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+ {0, 0, 0}, // localWorkSize: local group size in thread
+ {0, 0, 0}}; // globalWorkSize: image size in thread
+
+ float output_zp = 0;
+ float output_scale = 1;
+ uint32_t width = 0;
+ uint32_t height = 0;
+
+ vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
+ vsi_size_array_t * out_shape = NULL;
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ out_shape = attr[0]->shape;
+ width = (uint32_t)(out_shape->data[0]);
+ height = (uint32_t)(out_shape->data[1]);
+
+ if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ {
+ if ( attr[0]->dfp.fl > 0 )
+ {
+ output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
+ }
+ else
+ {
+ output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
+ }
+ }
+ else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
+ {
+ output_zp = (float)attr[0]->asymm.zero_point;
+ output_scale /= attr[0]->asymm.scale;
+ }
+
+ shaderParam.global_scale[0] = 4;
+ shaderParam.global_scale[1] = 1;
+ shaderParam.global_scale[2] = 1;
+ shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
+ / shaderParam.global_scale[0], 4);
+ shaderParam.global_size[1] = height;
+ shaderParam.global_size[2] = 1;
+
+ status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+ {
+ gpu_dp_inst_t uniVecShift10 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00020000, 0x00060004, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000400, 0x00000000, 0x00000400, 0x00000000,
+ 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniAddRShift = {{
+ 0x0f0f0f0f, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002405, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniGetTempVal = {{
+ 0x09090909, // TCfg
+ 0x00000000, // ASelt
+ 0x00230001, 0x00670045, // ABin
+ 0x05050505, // BSelt
+ 0x00110000, 0x00330022, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniExtractBytes = {{
+ 0x0f0f0f0f, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002414, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniConvertIntergetoF32_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniExtractHalf8_2x8 = {{
+ 0x11111111, // TCfg
+ 0x11110000, // ASelt
+ 0x06040200, 0x06040200, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
+ 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniExtractInteger_2x8 = {{
+ 0x33333333, // TCfg
+ 0x11110000, // ASelt
+ 0x03020100, 0x03020100, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+
+ status = vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
+
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
+
+ if (attr[0]->dtype == F16)
+ {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
+ }
+ else
+ {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
+ }
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+
+OnError:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ return status;
+} /* _pre_process_rgb888_planar_initializer() */
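
For the scale path, everything the shader needs on the output side is folded into output_scale and output_zp: the caller's preprocessing scale (param[13]) divided by the output tensor's quantization scale, or multiplied by 2^fl for DFP. A sketch of the float reference this corresponds to, assuming the usual affine model q = f / s + zp and that mean subtraction happens separately in the shader:

/* Sketch, not the shader itself: how a resampled pixel p maps to the
 * quantized output under the folded parameters computed above. */
static float fold_output_scale(float preproc_scale, float out_quant_scale)
{
    return preproc_scale / out_quant_scale;        /* what ends up in output_scale */
}

static float quantize_pixel(float p, float mean, float folded_scale, float output_zp)
{
    return (p - mean) * folded_scale + output_zp;  /* rounding/saturation is done by the DP unit */
}
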
+
+DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t shaderParam = {
+ 2, // workdim
+ {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
+ {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+ {0, 0, 0}, // localWorkSize: local group size in thread
+ {0, 0, 0}}; // globalWorkSize: image size in thread
+
+ float output_zp = 0;
+ float output_scale = 1;
+ uint32_t width = 0;
+ uint32_t height = 0;
+
+ vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
+ vsi_size_array_t * out_shape = NULL;
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ out_shape = attr[0]->shape;
+ width = (uint32_t)(out_shape->data[0]);
+ height = (uint32_t)(out_shape->data[1]);
+
+ if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ {
+ if ( attr[0]->dfp.fl > 0 )
+ {
+ output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
+ }
+ else
+ {
+ output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
+ }
+ }
+ else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
+ {
+ output_zp = (float)attr[0]->asymm.zero_point;
+ output_scale /= attr[0]->asymm.scale;
+ }
+
+ shaderParam.global_scale[0] = 16;
+ shaderParam.global_scale[1] = 1;
+ shaderParam.global_scale[2] = 1;
+ shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
+ / shaderParam.global_scale[0], 4);
+ shaderParam.global_size[1] = height;
+ shaderParam.global_size[2] = 1;
+
+ status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+ {
+ gpu_dp_inst_t uniDataMeanStddevLo_2x8 = {{
+ 0x99999999, // TCfg
+ 0x44444444, // ASelt
+ 0x03020100, 0x07060504, // ABin
+ 0x99999999, // BSelt
+ 0x06060606, 0x06060606, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniDataMeanStddevHi_2x8 = {{
+ 0x99999999, // TCfg
+ 0x44444444, // ASelt
+ 0x0b0a0908, 0x0f0e0d0c, // ABin
+ 0x99999999, // BSelt
+ 0x06060606, 0x06060606, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant
+ }, GPU_DP_TYPE_16 };
+
+ status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevHi_2x8", &uniDataMeanStddevHi_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+
+OnError:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ return status;
+} /* _pre_process_rgb888_planar_copy_initializer() */
+
+DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t shaderParam = {
+ 2, // workdim
+ {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
+ {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+ {0, 0, 0}, // localWorkSize: local group size in thread
+ {0, 0, 0}}; // globalWorkSize: image size in thread
+
+ uint32_t width = 0;
+ uint32_t height = 0;
+ vsi_bool is_4_over_3 = 0;
+ vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
+ vsi_size_array_t * out_shape = NULL;
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
+
+ out_shape = attr[1]->shape;
+ width = (uint32_t)(out_shape->data[0]);
+ height = (uint32_t)(out_shape->data[1]);
+
+ is_4_over_3 = (attr[0]->shape->data[0] * 3 == width * 4) &&
+ (attr[0]->shape->data[1] * 3 == height * 4);
+
+ if (is_4_over_3)
+ {
+ shaderParam.global_scale[0] = 16;
+ shaderParam.global_scale[1] = 4;
+ shaderParam.global_size[0] = gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1)
+ / shaderParam.global_scale[0], 4);
+ shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1)
+ / shaderParam.global_scale[1];
+ }
+ else
+ {
+ shaderParam.global_scale[0] = 16;
+ shaderParam.global_scale[1] = 2;
+ shaderParam.global_size[0] = gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1)
+ / shaderParam.global_scale[0], 4);
+ shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1)
+ / shaderParam.global_scale[1];
+ }
+
+ status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+ if (is_4_over_3)
+ {
+ gpu_dp_inst_t uniBilinear_4over3_l00_2x8 = {{
+ 0x51551551, // TCfg
+ 0x00000000, // ASelt
+ 0x04322100, 0xa9087665, // ABin
+ 0xa2aa2aa2, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000610, // AccumType, ConstantType, and PostShift
+ 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff,
+ 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniBilinear_4over3_l10_2x8 = {{
+ 0x00005515, // TCfg
+ 0x00000000, // ASelt
+ 0xfeed0cba, 0x00000000, // ABin
+ 0x0000aa2a, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000610, // AccumType, ConstantType, and PostShift
+ 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniBilinear_4over3_l01_4x4 = {{
+ 0x05555505, // TCfg
+ 0x04505004, // ASelt
+ 0x21210000, 0x00443232, // ABin
+ 0x0aaaaa0a, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000610, // AccumType, ConstantType, and PostShift
+ 0x5555aaaa, 0x00000000, 0x38e471c7, 0x1c7238e4,
+ 0x71c738e4, 0x38e41c72, 0x5555aaaa, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniBilinear_4over3_l11_4x4 = {{
+ 0x55055555, // TCfg
+ 0x50045050, // ASelt
+ 0x76766565, 0xa9a90088, // ABin
+ 0xaa0aaaaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000610, // AccumType, ConstantType, and PostShift
+ 0x38e471c7, 0x1c7238e4, 0x71c738e4, 0x38e41c72,
+ 0x5555aaaa, 0x00000000, 0x38e471c7, 0x1c7238e4 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniBilinear_4over3_l21_4x4 = {{
+ 0x55550555, // TCfg
+ 0x50500450, // ASelt
+ 0x00ccbaba, 0xfefeeded, // ABin
+ 0xaaaa0aaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000610, // AccumType, ConstantType, and PostShift
+ 0x71c738e4, 0x38e41c72, 0x5555aaaa, 0x00000000,
+ 0x38e471c7, 0x1c7238e4, 0x71c738e4, 0x38e41c72 // Constant
+ }, GPU_DP_TYPE_16 };
+
+
+ status = vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l00_2x8", &uniBilinear_4over3_l00_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l10_2x8", &uniBilinear_4over3_l10_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l01_4x4", &uniBilinear_4over3_l01_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l11_4x4", &uniBilinear_4over3_l11_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l21_4x4", &uniBilinear_4over3_l21_4x4);
+
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+
+OnError:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ if (attr[1])
+ {
+        vsi_nn_kernel_tensor_attr_release( &attr[1] );
+        attr[1] = NULL;
+ }
+
+ return status;
+} /* _resize_rgb888_planar_initializer() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs,
+ vsi_nn_kernel_t* kernel,
+ const vsi_nn_kernel_param_t * params,
+ vsi_bool is_no_range_change,
+ int32_t width,
+ int32_t height
+ )
+{
+ vsi_nn_kernel_dtype_e input0_dtype = U8;
+ vsi_nn_kernel_dtype_e output_dtype = U8;
+ _internal_scale_e scale_type = SCALE;
+ vsi_status status = VSI_FAILURE;
+ uint32_t key = 0;
+ int32_t i = 0;
+ vsi_bool is_4_over_3 = FALSE;
+ vsi_bool is_half_scale = FALSE;
+ vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
+
+ is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) &&
+ (height * 3 == (int32_t)outputs[0]->attr.size[1] * 4);
+ is_half_scale = (width == (int32_t)outputs[0]->attr.size[0] * 2) &&
+ (height == (int32_t)outputs[0]->attr.size[1] * 2);
+ input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ if (enable_copy)
+ {
+ scale_type = COPY;
+ }
+ else
+ {
+ if (is_no_range_change && is_4_over_3)
+ {
+ scale_type = FOUR_OVER_THREE;
+ }
+ else if (is_no_range_change && is_half_scale)
+ {
+ scale_type = HALF;
+ }
+ else
+ {
+ scale_type = SCALE;
+ }
+ }
+
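
The two special resize kernels are picked purely by size ratio: FOUR_OVER_THREE when the crop is exactly 4/3 of the output in both dimensions, HALF when it is exactly 2x, and anything else falls back to the generic SCALE kernel. For example, a 640x480 crop resized to 480x360 satisfies width * 3 == out_w * 4 in both dimensions, while 640x480 to 320x240 takes HALF. A quick check of the predicates:

#include <stdio.h>

int main(void)
{
    int w = 640, h = 480;            /* crop size */
    int ow = 480, oh = 360;          /* output size */
    int four_over_three = (w * 3 == ow * 4) && (h * 3 == oh * 4);
    int half            = (w == ow * 2)     && (h == oh * 2);
    printf("4/3: %d, half: %d\n", four_over_three, half);   /* 4/3: 1, half: 0 */
    return 0;
}
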
+ key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, scale_type);
+
+ for ( i = 0; i < _cnt_of_array(pre_process_rgb888_planar_kernel_map); i ++ )
+ {
+ if ( pre_process_rgb888_planar_kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < _cnt_of_array(pre_process_rgb888_planar_kernel_map) )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s",
+ pre_process_rgb888_planar_kernel_map[i].function_name );
+ kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
+
+ if (enable_copy)
+ {
+ kernel->info.initialize = _pre_process_rgb888_planar_copy_initializer;
+ }
+ else if (scale_type == FOUR_OVER_THREE || scale_type == HALF)
+ {
+ kernel->info.initialize = _resize_rgb888_planar_initializer;
+ }
+ else
+ {
+ kernel->info.initialize = _pre_process_rgb888_planar_initializer;
+ }
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ "vsi_nn_kernel_header",
+ pre_process_rgb888_planar_kernel_map[i].source_name );
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ pre_process_rgb888_planar_kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ int32_t width = vsi_nn_kernel_param_get_int32( params, "width" );
+ int32_t height = vsi_nn_kernel_param_get_int32( params, "height" );
+ float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
+ float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
+ float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
+ float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
+ vsi_bool is_no_range_change = FALSE;
+
+ if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
+ outputs[0]->attr.dim_num ) )
+ {
+ return NULL;
+ }
+
+ if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&
+ outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 &&
+ outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC &&
+ (float)outputs[0]->attr.dtype.zero_point == r_mean && r_mean == g_mean && r_mean == b_mean &&
+ vsi_nn_abs(outputs[0]->attr.dtype.scale - scale) < 1e-8 )
+ {
+ is_no_range_change = TRUE;
+ }
+
+ status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height );
+ if ( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ uint32_t index = 6;
+ int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
+ int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
+ int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
+ int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
+
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM,
+ inputs, input_num, outputs, output_num );
+
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM );
+ vsi_nn_kernel_scalar_release( &node_params[6] );
+ vsi_nn_kernel_scalar_release( &node_params[7] );
+ vsi_nn_kernel_scalar_release( &node_params[8] );
+ vsi_nn_kernel_scalar_release( &node_params[9] );
+ vsi_nn_kernel_scalar_release( &node_params[10] );
+ vsi_nn_kernel_scalar_release( &node_params[11] );
+ vsi_nn_kernel_scalar_release( &node_params[12] );
+ vsi_nn_kernel_scalar_release( &node_params[13] );
+ }
+ }
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( pre_process_rgb888_planar, _setup )
diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c
index ac72b9f..7fe19bc 100644
--- a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@@ -480,6 +479,7 @@ static vsi_nn_kernel_node_t _setup
}
kernel_preprocess = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
+ CHECK_PTR_FAIL_GOTO( kernel_preprocess, "Create kernel fail.", final );
// Assign unique_id
kernel_preprocess->unique_id = kernel->unique_id;
diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
index e3b5582..394461f 100644
--- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
@@ -1370,6 +1369,7 @@ static vsi_nn_kernel_node_t _setup
if (is_run_opt_kernel)
{
scale = _create_scale_tensor(graph, inputs[0], outputs[0], align_corners, half_pixel_centers);
+ CHECK_PTR_FAIL_GOTO( scale, "Create buffer fail.", final );
node_params[SCALAR_TENSOR_SCALE] = (vsi_nn_kernel_node_param_t)(scale->t);
node_params_num = _RESIZE_BILINEAR_PARAM_NUM;
}
@@ -1378,16 +1378,11 @@ static vsi_nn_kernel_node_t _setup
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
- if (is_run_opt_kernel)
- {
- if (scale)
- {
- vsi_nn_ReleaseTensor(&scale);
- }
- }
}
}
+final:
+ vsi_safe_release_tensor(scale);
return node;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c
index 778d1fe..2e0cac5 100644
--- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -206,7 +205,11 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
block_size = (int32_t)(attr[2]->shape->data[0]);
height = (int32_t)(attr[2]->shape->data[1]);
index_num = (int32_t)(attr[0]->shape->data[1]);
- output_zp = (int32_t)(attr[2]->asymm.zero_point);
+
+ if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
+ {
+ output_zp = attr[2]->asymm.zero_point;
+ }
if(coord_dim == 3)
{
@@ -359,7 +362,11 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer)
block_size = (int32_t)(attr[2]->shape->data[0]);
height = (int32_t)(attr[2]->shape->data[1]);
index_num = (int32_t)(attr[0]->shape->data[1]);
- output_zp = (int32_t)(attr[2]->asymm.zero_point);
+
+ if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
+ {
+ output_zp = attr[2]->asymm.zero_point;
+ }
if(coord_dim == 3)
{
@@ -552,4 +559,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( scatter_nd, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c
index 55af6c0..c277ba5 100644
--- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -332,12 +331,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
- src0ZP = 0;
- }
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- src0Scale = 1;
- src0ZP = 0;
}
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
@@ -355,12 +348,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
{
src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl));
}
- src2ZP = 0;
- }
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- src2Scale = 1;
- src2ZP = 0;
}
if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
@@ -379,12 +366,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl));
}
dstScale = 1.0f/dstScale;
- dstZP = 0;
- }
- else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- dstScale = 1;
- dstZP = 0;
}
if (coord_dim == 5)
@@ -623,12 +604,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
- src0ZP = 0;
- }
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- src0Scale = 1;
- src0ZP = 0;
}
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
@@ -646,12 +621,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
{
src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl));
}
- src2ZP = 0;
- }
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- src2Scale = 1;
- src2ZP = 0;
}
if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
@@ -669,13 +638,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl));
}
- dstScale = 1.0f/dstScale;
- dstZP = 0;
- }
- else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- dstScale = 1;
- dstZP = 0;
+ dstScale = 1.0f / dstScale;
}
if (coord_dim == 5)
@@ -873,12 +836,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer)
{
src0Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
}
- src0ZP = 0;
- }
- else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- src0Scale = 1;
- src0ZP = 0;
}
if (coord_dim == 5)
@@ -1029,12 +986,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer)
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
- src0ZP = 0;
- }
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- src0Scale = 1;
- src0ZP = 0;
}
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
@@ -1052,10 +1003,6 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer)
src2Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
}
}
- else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- src2Scale = 1;
- }
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
@@ -1072,12 +1019,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer)
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
- dstScale = 1.0f/dstScale;
- dstZP = 0;
- }
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
- {
- dstScale = 1;
+ dstScale = 1.0f / dstScale;
dstZP = 0;
}
@@ -1602,4 +1544,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( scatter_nd_update, _setup )
-
diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c
index 5a101e0..ac8ff6c 100644
--- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -167,6 +166,7 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer)
float scale[4] = {0};
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
@@ -257,7 +257,9 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
float output_zp = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
index fb2bd1f..fea09ff 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
@@ -340,7 +340,7 @@ static char* _load_source_code_from_file
*size = read_bytes;
}
final:
- fclose( fp );
+ if (fp) fclose( fp );
return source;
} /* _load_source_code_from_file() */
@@ -351,15 +351,17 @@ static vx_program _create_program
size_t num
)
{
- vx_char** sources;
- vx_size* source_sizes;
+ vx_char** sources = NULL;
+ vx_size* source_sizes = NULL;
size_t i;
vsi_status status;
vx_program program;
program = NULL;
sources = (vx_char**)malloc( sizeof(vx_char*) * num );
+ CHECK_PTR_FAIL_GOTO( sources, "Create buffer fail.", final );
source_sizes = (vx_size*)malloc( sizeof(vx_size) * num );
+ CHECK_PTR_FAIL_GOTO( source_sizes, "Create buffer fail.", final );
for( i = 0; i < num; i ++ )
{
@@ -373,14 +375,11 @@ static vx_program _create_program
{
VSILOGE("Create program from source fail!");
}
- if( sources )
- {
- free( sources );
- }
- if( source_sizes )
- {
- free( source_sizes );
- }
+
+final:
+ vsi_nn_safe_free( sources );
+ vsi_nn_safe_free( source_sizes );
+
return program;
} /* _create_program() */
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c
index b5dfa9e..80d56d0 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c
@@ -69,10 +69,11 @@ static float log_eval(float data)
return logf(data);
}
-static float elu_eval(float data, vsi_nn_kernel_lut_params *lut_param)
+static float selu_eval(float data, vsi_nn_kernel_lut_params *lut_param)
{
float alpha = lut_param->params[0];
- return data >=0 ? data : expf(data) * alpha - alpha;
+ float gamma = lut_param->params[1];
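+    /* SELU: gamma * x for x >= 0, gamma * alpha * (exp(x) - 1) for x < 0 */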
+ return data >=0 ? data * gamma : expf(data) * alpha * gamma - alpha * gamma;
}
static float neg_eval(float data)
@@ -179,6 +180,15 @@ static float square_eval(float x)
return x * x;
}
+static float celu_eval(float x, vsi_nn_kernel_lut_params *lut_param)
+{
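+    /* CELU: max(0, x) + min(0, alpha * (exp(x / alpha) - 1)) */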
+ float alpha = lut_param->params[0];
+ float positive = vsi_nn_max(0, x);
+ float negative = vsi_nn_min(alpha * (expf(x / alpha) - 1), 0);
+
+ return positive + negative;
+}
+
static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param)
{
float result = 0;
@@ -196,8 +206,8 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *
result = exp_eval(data);
break;
break;
- case VSI_NN_KERNEL_LUT_ELU:
- result = elu_eval(data, lut_param);
+ case VSI_NN_KERNEL_LUT_SELU:
+ result = selu_eval(data, lut_param);
break;
break;
case VSI_NN_KERNEL_LUT_NEG:
@@ -232,6 +242,9 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *
case VSI_NN_KERNEL_LUT_SQUARE:
result = square_eval(data);
break;
+ case VSI_NN_KERNEL_LUT_CELU:
+ result = celu_eval(data, lut_param);
+ break;
default:
VSILOGE( "unsupported activation function:%d", lut_param->act_type );
break;
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c
index 20b4589..c5b640c 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c
@@ -74,7 +74,7 @@ typedef struct
vsi_bool vsi_nn_kernel_param_add_##TYPE_NAME \
(vsi_nn_kernel_param_t* params, const char* key, TYPE value) \
{ \
- _param_type* p; \
+ _param_type* p = NULL; \
CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); \
CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); \
p = malloc( sizeof(_param_type) ); \
@@ -89,11 +89,11 @@ typedef struct
TYPE vsi_nn_kernel_param_get_##TYPE_NAME \
( const vsi_nn_kernel_param_t* params, const char* key) \
{ \
- _param_type* p; \
+ _param_type* p = NULL; \
CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); \
CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); \
p = vsi_nn_hashmap_get( params, key ); \
- if( p->type != PARAM_DTYPE ) { \
+ if( p && p->type != PARAM_DTYPE ) { \
VSILOGW("Key %s is not \"%s\"", key, ""#TYPE_NAME ); \
} \
CHECK_PARAM_NULL( p, DEFAULT_VALUE, "Key %s not in params.", key ); \
@@ -236,4 +236,3 @@ void vsi_nn_kernel_param_clear( vsi_nn_kernel_param_t * params )
vsi_nn_hashmap_clear( hashmap );
}
} /* vsi_nn_kernel_param_clear() */
-
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
index e3f454a..bdb2240 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
@@ -122,7 +122,7 @@ static vsi_status _select
REGISTER_VX_FIRST_KERNEL_SELECTOR(exp)
REGISTER_VX_FIRST_KERNEL_SELECTOR(log)
-REGISTER_VX_FIRST_KERNEL_SELECTOR(elu)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(selu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(neg)
REGISTER_VX_FIRST_KERNEL_SELECTOR(mish)
REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_sigmoid)
@@ -132,5 +132,6 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(erf)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(matrixmul)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(celu)
__END_DECLS
diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c
index 30b1257..0ab544b 100644
--- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c
@@ -65,11 +65,15 @@ static vsi_nn_kernel_node_t _setup
lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "min_value" );
lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "max_value" );
}
- else if (lut_type == VSI_NN_KERNEL_LUT_ELU || lut_type == VSI_NN_KERNEL_LUT_HSIGMOID)
+ else if (lut_type == VSI_NN_KERNEL_LUT_SELU || lut_type == VSI_NN_KERNEL_LUT_HSIGMOID)
{
lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "alpha" );
lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "beta" );
}
+ else if (lut_type == VSI_NN_KERNEL_LUT_CELU)
+ {
+ lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "alpha" );
+ }
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
@@ -91,7 +95,7 @@ static vsi_nn_kernel_node_t _setup
node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t);
if ( NULL == node )
{
- VSILOGW("Call vxTensorTableLookupLayer fail.");
+ VSILOGI("Call vxTensorTableLookupLayer fail.");
goto final;
}
@@ -133,7 +137,7 @@ final:
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( mish, VSI_NN_KERNEL_LUT_MISH )
//REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( exp, VSI_NN_KERNEL_LUT_EXP )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( log, VSI_NN_KERNEL_LUT_LOG )
-REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( elu, VSI_NN_KERNEL_LUT_ELU )
+REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( selu, VSI_NN_KERNEL_LUT_SELU )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( neg, VSI_NN_KERNEL_LUT_NEG )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_sigmoid, VSI_NN_KERNEL_LUT_HSIGMOID )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( gelu, VSI_NN_KERNEL_LUT_GELU )
@@ -141,6 +145,7 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_gelu, VSI_NN_KERNEL_LUT_HGELU
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP )
+REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU )
#undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL
diff --git a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c
index bdbc6c4..3f5bfa1 100644
--- a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c
@@ -74,7 +74,7 @@ REGISTER_BATCH_GEMM_OPENVX_KERNEL( matrixmul )
if( NULL == node )
{
- VSILOGW("Call vxBatchGemmNode fail.");
+ VSILOGI("Call vxBatchGemmNode fail.");
goto OnError;
}
@@ -88,4 +88,3 @@ OnError:
#undef REGISTER_BATCH_GEMM_OPENVX_KERNEL
#endif
-
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl b/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl
index dddb09b..84d5ed1 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl
@@ -1,10 +1,16 @@
#pragma OPENCL EXTENSION CL_VIV_asm : enable
-__kernel void clip_BF16toBF16(
- __read_only image2d_array_t input,
- __write_only image2d_array_t output,
- float minData,
- float maxData)
+__kernel void clip_BF16toBF16
+ (
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
uint4 src0 = read_imageui(input, coord);
@@ -18,11 +24,17 @@ __kernel void clip_BF16toBF16(
write_imageui(output, coord, dst);
}
-__kernel void clip_BF16toBF16_2D(
- __read_only image2d_t input,
- __write_only image2d_t output,
- float minData,
- float maxData)
+__kernel void clip_BF16toBF16_2D
+ (
+ __read_only image2d_t input,
+ __write_only image2d_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 src0 = read_imageui(input, coord);
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/clip_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/clip_F32.cl
index 384798e..466a14e 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/clip_F32.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/clip_F32.cl
@@ -1,64 +1,111 @@
-__kernel void clip_F32toF32(
- __read_only image2d_array_t input,
- __write_only image2d_array_t output,
- float minData,
- float maxData)
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
- float4 src = read_imagef(input, coord);
- float4 dst = src > minData ? src : minData;
- dst = dst < maxData ? dst : maxData;
- write_imagef(output, coord, dst);
-}
-
-__kernel void clip_F32toF32_2D(
- __read_only image2d_t input,
- __write_only image2d_t output,
- float minData,
- float maxData)
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
- float4 src = read_imagef(input, coord);
- float4 dst = src > minData ? src : minData;
- dst = dst < maxData ? dst : maxData;
- write_imagef(output, coord, dst);
-}
-
-__kernel void clip_F32toU8(
- __read_only image2d_array_t input,
- __write_only image2d_array_t output,
- float minData,
- float maxData,
+__kernel void clip_F32toF32
+ (
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float minData,
+ float maxData,
float inputScale,
float inputTail,
float outputScale,
float outputZP
- )
+ )
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
float4 src = read_imagef(input, coord);
- float4 result = src > minData ? src : minData;
- result = result < maxData ? result : maxData;
- uint4 dst = convert_uint4_rte(result * outputScale + outputZP);
- write_imageui(output, coord, dst);
+ float4 dst = clamp(src, minData, maxData);
+ write_imagef(output, coord, dst);
}
-__kernel void clip_F32toU8_2D(
- __read_only image2d_t input,
- __write_only image2d_t output,
- float minData,
- float maxData,
+__kernel void clip_F32toF32_2D
+ (
+ __read_only image2d_t input,
+ __write_only image2d_t output,
+ float minData,
+ float maxData,
float inputScale,
float inputTail,
float outputScale,
float outputZP
- )
+ )
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
float4 src = read_imagef(input, coord);
- float4 result = src > minData ? src : minData;
- result = result < maxData ? result : maxData;
+ float4 dst = clamp(src, minData, maxData);
+ write_imagef(output, coord, dst);
+}
+
+__kernel void clip_F32toU8
+ (
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+ float4 src = read_imagef(input, coord);
+ float4 result = clamp(src, minData, maxData);
uint4 dst = convert_uint4_rte(result * outputScale + outputZP);
write_imageui(output, coord, dst);
}
+__kernel void clip_F32toU8_2D
+ (
+ __read_only image2d_t input,
+ __write_only image2d_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));
+ float4 src = read_imagef(input, coord);
+ float4 result = clamp(src, minData, maxData);
+ uint4 dst = convert_uint4_rte(result * outputScale + outputZP);
+ write_imageui(output, coord, dst);
+}
+
+__kernel void clip_F32toI32
+ (
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+ float4 src = read_imagef(input, coord);
+ float4 result = clamp(src, minData, maxData);
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);
+ write_imagei(output, coord, dst);
+}
+
+__kernel void clip_F32toI32_2D
+ (
+ __read_only image2d_t input,
+ __write_only image2d_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));
+ float4 src = read_imagef(input, coord);
+ float4 result = clamp(src, minData, maxData);
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);
+ write_imagei(output, coord, dst);
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/clip_I32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/clip_I32.cl
new file mode 100644
index 0000000..f3d2421
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/clip_I32.cl
@@ -0,0 +1,73 @@
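+/* Clip kernels for I32 input: dequantize with inputScale/inputTail, clamp to [minData, maxData], then requantize to I32 or write as F32. */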
+__kernel void clip_I32toI32
+ (
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ float4 result = clamp(src, minData, maxData);
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);
+ write_imagei(output, coord, dst);
+}
+
+__kernel void clip_I32toI32_2D
+ (
+ __read_only image2d_t input,
+ __write_only image2d_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ float4 result = clamp(src, minData, maxData);
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);
+ write_imagei(output, coord, dst);
+}
+
+__kernel void clip_I32toF32
+ (
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ float4 dst = clamp(src, minData, maxData);
+ write_imagef(output, coord, dst);
+}
+
+__kernel void clip_I32toF32_2D
+ (
+ __read_only image2d_t input,
+ __write_only image2d_t output,
+ float minData,
+ float maxData,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ float4 dst = clamp(src, minData, maxData);
+ write_imagef(output, coord, dst);
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl
new file mode 100644
index 0000000..c991ffc
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl
@@ -0,0 +1,228 @@
+float eltwise_unary_sin(float x, float alpha, float beta)
+{
+ return native_sin(x);
+}
+
+float eltwise_unary_cos(float x, float alpha, float beta)
+{
+ return native_cos(x);
+}
+
+#define logE (1.44269502f)
+#define twoLogE (logE * 2.0f)
+float eltwise_unary_exp(float x, float alpha, float beta)
+{
+ x *= logE;
+ x = exp2(x);
+ return x;
+}
+
+#define rlogE (0.693147182f)
+float eltwise_unary_log(float x, float alpha, float beta)
+{
+ x = log2(x);
+ return x * rlogE;
+}
+
+float eltwise_unary_neg(float x, float alpha, float beta)
+{
+ return x * -1;
+}
+
+float eltwise_unary_hard_sigmoid(float x, float alpha, float beta)
+{
+ x = alpha * x + beta;
+ x = clamp(x, 0, 1);
+ return x;
+}
+
+float _softrelu(float x, float alpha)
+{
+ x *= logE;
+ x = exp2(x);
+ x += 1;
+ x = log2(x);
+ return x * rlogE;
+}
+
+float _tanh(float x, float alpha)
+{
+ x *= -twoLogE;
+ x = 1 + exp2(x);
+ x = 1 / x;
+ return (2 * x - 1);
+}
+
+float eltwise_unary_mish(float x, float alpha, float beta)
+{
+ float y = _softrelu(x, alpha);
+ x = x * _tanh(y, alpha);
+ return x;
+}
+
+float eltwise_unary_round(float x, float alpha, float beta)
+{
+ return convert_float(convert_int_rte(x));
+}
+
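+/* erf(x) approximated by a rational polynomial: x * P(x^2) / Q(x^2), with x clamped to [-4, 4] */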
+float evaluate_polynomial_alpha(float x2)
+{
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,
+ -2.10102402082508e-06f, -5.69250639462346e-05f};
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,
+ -1.60960333262415e-02f, 0};
+
+ float poly = alpha0.x * x2 + alpha0.y;
+ poly = poly * x2 + alpha0.z;
+ poly = poly * x2 + alpha0.w;
+ poly = poly * x2 + alpha1.x;
+ poly = poly * x2 + alpha1.y;
+ poly = poly * x2 + alpha1.z;
+
+ return poly;
+}
+
+float evaluate_polynomial_beta(float x2)
+{
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,
+ -1.68282697438203e-03f, -7.37332916720468e-03f};
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};
+
+ float poly = beta0.x * x2 + beta0.y;
+ poly = poly * x2 + beta0.z;
+ poly = poly * x2 + beta0.w;
+ poly = poly * x2 + beta1.x;
+
+ return 1.0f / poly;
+}
+
+float erf_eval(float _x)
+{
+ float x = clamp(_x, -4, 4);
+ float x2 = x * x;
+
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);
+}
+
+#define RSQRT2 (0.70710678118654752440084436210485f)
+float eltwise_unary_gelu(float x, float alpha, float beta)
+{
+ x = 0.5f * x * (1 + erf_eval(x * RSQRT2));
+
+ return x;
+}
+
+#define SQRT_2_RCP_PI 0.7978845834732056f
+float eltwise_unary_hard_gelu(float x, float alpha, float beta)
+{
+ float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *
+ (x + 0.044715f * x * x * x), 0);
+ return x * cdf;
+}
+
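+/* SELU: val > 0 -> val * gamma, otherwise alpha_times_gamma * (exp(val) - 1) */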
+float eltwise_unary_selu(float val, float alpha_times_gamma, float gamma)
+{
+ float x = val * logE;
+ x = exp2(x) * alpha_times_gamma - alpha_times_gamma;
+
+ return val <= 0 ? x : val * gamma;
+}
+
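+/* CELU: val < 0 -> alpha * (exp(val * rcp_alpha) - 1), otherwise val; rcp_alpha is presumably 1 / alpha */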
+float eltwise_unary_celu(float val, float alpha, float rcp_alpha)
+{
+ float x = val * logE * rcp_alpha;
+ x = exp2(x) * alpha - alpha;
+
+ return val < 0 ? x : val;
+}
+
+#define ELTWISE_UNARY_F32_2D(func_name) \
+__kernel void func_name##_F32toF32_2D \
+ ( \
+ __read_only image2d_t input, \
+ __write_only image2d_t output, \
+ float inputScale, \
+ float inputTail, \
+ float outputScale, \
+ float outputZP, \
+ float alpha, \
+ float beta \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ float4 src = read_imagef(input, coord); \
+ \
+ float4 dst = 0; \
+ dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \
+ \
+ write_imagef(output, coord, dst.xxxx); \
+}
+ELTWISE_UNARY_F32_2D(sin)
+ELTWISE_UNARY_F32_2D(cos)
+ELTWISE_UNARY_F32_2D(exp)
+ELTWISE_UNARY_F32_2D(log)
+ELTWISE_UNARY_F32_2D(neg)
+ELTWISE_UNARY_F32_2D(mish)
+ELTWISE_UNARY_F32_2D(hard_sigmoid)
+ELTWISE_UNARY_F32_2D(round)
+ELTWISE_UNARY_F32_2D(gelu)
+ELTWISE_UNARY_F32_2D(hard_gelu)
+ELTWISE_UNARY_F32_2D(selu)
+ELTWISE_UNARY_F32_2D(celu)
+
+#define ELTWISE_UNARY_U8_2D(func_name) \
+__kernel void func_name##_U8toU8_2D \
+ ( \
+ __read_only image2d_t input, \
+ __write_only image2d_t output, \
+ float inputScale, \
+ float inputTail, \
+ float outputScale, \
+ float outputZP, \
+ float alpha, \
+ float beta \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ uint4 src = read_imageui(input, coord); \
+ float4 data = convert_float4(src) * inputScale - inputTail; \
+ \
+ data.x = eltwise_unary_##func_name(data.x, alpha, beta); \
+ uint4 dst = convert_uint4(data * outputScale + outputZP); \
+ \
+ write_imageui(output, coord, dst); \
+}
+ELTWISE_UNARY_U8_2D(sin)
+ELTWISE_UNARY_U8_2D(cos)
+ELTWISE_UNARY_U8_2D(exp)
+ELTWISE_UNARY_U8_2D(log)
+ELTWISE_UNARY_U8_2D(neg)
+ELTWISE_UNARY_U8_2D(mish)
+ELTWISE_UNARY_U8_2D(hard_sigmoid)
+ELTWISE_UNARY_U8_2D(round)
+ELTWISE_UNARY_U8_2D(gelu)
+ELTWISE_UNARY_U8_2D(hard_gelu)
+ELTWISE_UNARY_U8_2D(selu)
+ELTWISE_UNARY_U8_2D(celu)
+
+__kernel void neg_I32toI32_2D
+ (
+ __read_only image2d_t input,
+ __write_only image2d_t output,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP,
+ float alpha,
+ float beta
+ )
+{
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));
+ int4 src = read_imagei(input, coord);
+
+ int4 dst = -src;
+
+ write_imagei(output, coord, dst);
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl
similarity index 58%
rename from src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl
rename to src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl
index 55b63cb..20cc454 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl
@@ -24,14 +24,6 @@ float eltwise_unary_log(float x, float alpha, float beta)
return x * rlogE;
}
-float eltwise_unary_elu(float val, float alpha, float beta)
-{
- float x = val * logE;
- x = exp2(x) * alpha - alpha;
-
- return val < 0 ? x : val;
-}
-
float eltwise_unary_neg(float x, float alpha, float beta)
{
return x * -1;
@@ -73,34 +65,45 @@ float eltwise_unary_round(float x, float alpha, float beta)
return convert_float(convert_int_rte(x));
}
-#define MUL2_RSQRTPI (1.1283791670955126f)
-float erf_eval(float x)
+float evaluate_polynomial_alpha(float x2)
{
- float res = 0;
- float tmp = x;
- float factorial = 1;
- float x_pow = x;
- float one = 1.0f;
- float n = 1;
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,
+ -2.10102402082508e-06f, -5.69250639462346e-05f};
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,
+ -1.60960333262415e-02f, 0};
- if (x <= -3)
- return -1;
- else if (x >= 3)
- return 1;
+ float poly = alpha0.x * x2 + alpha0.y;
+ poly = poly * x2 + alpha0.z;
+ poly = poly * x2 + alpha0.w;
+ poly = poly * x2 + alpha1.x;
+ poly = poly * x2 + alpha1.y;
+ poly = poly * x2 + alpha1.z;
- while (fabs(tmp) > 1e-5)
- {
- res += tmp;
-
- factorial *= n;
- one *= -1;
- x_pow *= x * x;
- tmp = one / factorial * x_pow / ( 2 * n + 1);
-
- n += 1.0f;
- }
- return res * MUL2_RSQRTPI;
+ return poly;
}
+
+float evaluate_polynomial_beta(float x2)
+{
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,
+ -1.68282697438203e-03f, -7.37332916720468e-03f};
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};
+
+ float poly = beta0.x * x2 + beta0.y;
+ poly = poly * x2 + beta0.z;
+ poly = poly * x2 + beta0.w;
+ poly = poly * x2 + beta1.x;
+
+ return 1.0f / poly;
+}
+
+float erf_eval(float _x)
+{
+ float x = clamp(_x, -4, 4);
+ float x2 = x * x;
+
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);
+}
+
#define RSQRT2 (0.70710678118654752440084436210485f)
float eltwise_unary_gelu(float x, float alpha, float beta)
{
@@ -117,6 +120,22 @@ float eltwise_unary_hard_gelu(float x, float alpha, float beta)
return x * cdf;
}
+float eltwise_unary_selu(float val, float alpha_times_gamma, float gamma)
+{
+ float x = val * logE;
+ x = exp2(x) * alpha_times_gamma - alpha_times_gamma;
+
+ return val < 0 ? x : val * gamma;
+}
+
+float eltwise_unary_celu(float val, float alpha, float rcp_alpha)
+{
+ float x = val * logE * rcp_alpha;
+ x = exp2(x) * alpha - alpha;
+
+ return val < 0 ? x : val;
+}
+
#define ELTWISE_UNARY_F32(func_name) \
__kernel void func_name##_F32toF32 \
( \
@@ -143,47 +162,14 @@ ELTWISE_UNARY_F32(sin)
ELTWISE_UNARY_F32(cos)
ELTWISE_UNARY_F32(exp)
ELTWISE_UNARY_F32(log)
-ELTWISE_UNARY_F32(elu)
ELTWISE_UNARY_F32(neg)
ELTWISE_UNARY_F32(mish)
ELTWISE_UNARY_F32(hard_sigmoid)
ELTWISE_UNARY_F32(round)
ELTWISE_UNARY_F32(gelu)
ELTWISE_UNARY_F32(hard_gelu)
-
-#define ELTWISE_UNARY_F32_2D(func_name) \
-__kernel void func_name##_F32toF32_2D \
- ( \
- __read_only image2d_t input, \
- __write_only image2d_t output, \
- float inputScale, \
- float inputTail, \
- float outputScale, \
- float outputZP, \
- float alpha, \
- float beta \
- ) \
-{ \
- int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
- \
- float4 src = read_imagef(input, coord); \
- \
- float4 dst = 0; \
- dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \
- \
- write_imagef(output, coord, dst.xxxx); \
-}
-ELTWISE_UNARY_F32_2D(sin)
-ELTWISE_UNARY_F32_2D(cos)
-ELTWISE_UNARY_F32_2D(exp)
-ELTWISE_UNARY_F32_2D(log)
-ELTWISE_UNARY_F32_2D(elu)
-ELTWISE_UNARY_F32_2D(neg)
-ELTWISE_UNARY_F32_2D(mish)
-ELTWISE_UNARY_F32_2D(hard_sigmoid)
-ELTWISE_UNARY_F32_2D(round)
-ELTWISE_UNARY_F32_2D(gelu)
-ELTWISE_UNARY_F32_2D(hard_gelu)
+ELTWISE_UNARY_F32(selu)
+ELTWISE_UNARY_F32(celu)
#define ELTWISE_UNARY_U8(func_name) \
__kernel void func_name##_U8toU8 \
@@ -212,48 +198,14 @@ ELTWISE_UNARY_U8(sin)
ELTWISE_UNARY_U8(cos)
ELTWISE_UNARY_U8(exp)
ELTWISE_UNARY_U8(log)
-ELTWISE_UNARY_U8(elu)
ELTWISE_UNARY_U8(neg)
ELTWISE_UNARY_U8(mish)
ELTWISE_UNARY_U8(hard_sigmoid)
ELTWISE_UNARY_U8(round)
ELTWISE_UNARY_U8(gelu)
ELTWISE_UNARY_U8(hard_gelu)
-
-#define ELTWISE_UNARY_U8_2D(func_name) \
-__kernel void func_name##_U8toU8_2D \
- ( \
- __read_only image2d_t input, \
- __write_only image2d_t output, \
- float inputScale, \
- float inputTail, \
- float outputScale, \
- float outputZP, \
- float alpha, \
- float beta \
- ) \
-{ \
- int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
- \
- uint4 src = read_imageui(input, coord); \
- float4 data = convert_float4(src) * inputScale - inputTail; \
- \
- data.x = eltwise_unary_##func_name(data.x, alpha, beta); \
- uint4 dst = convert_uint4(data * outputScale + outputZP); \
- \
- write_imageui(output, coord, dst); \
-}
-ELTWISE_UNARY_U8_2D(sin)
-ELTWISE_UNARY_U8_2D(cos)
-ELTWISE_UNARY_U8_2D(exp)
-ELTWISE_UNARY_U8_2D(log)
-ELTWISE_UNARY_U8_2D(elu)
-ELTWISE_UNARY_U8_2D(neg)
-ELTWISE_UNARY_U8_2D(mish)
-ELTWISE_UNARY_U8_2D(hard_sigmoid)
-ELTWISE_UNARY_U8_2D(round)
-ELTWISE_UNARY_U8_2D(gelu)
-ELTWISE_UNARY_U8_2D(hard_gelu)
+ELTWISE_UNARY_U8(selu)
+ELTWISE_UNARY_U8(celu)
__kernel void neg_I32toI32
(
@@ -274,23 +226,3 @@ __kernel void neg_I32toI32
write_imagei(output, coord, dst);
}
-
-__kernel void neg_I32toI32_2D
- (
- __read_only image2d_t input,
- __write_only image2d_t output,
- float inputScale,
- float inputTail,
- float outputScale,
- float outputZP,
- float alpha,
- float beta
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
- int4 src = read_imagei(input, coord);
-
- int4 dst = -src;
-
- write_imagei(output, coord, dst);
-}
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl
index 0a0e410..f393abb 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl
@@ -1,26 +1,40 @@
-#define MUL2_RSQRTPI (1.1283791670955126f)
+float evaluate_polynomial_alpha(float x2)
+{
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,
+ -2.10102402082508e-06f, -5.69250639462346e-05f};
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,
+ -1.60960333262415e-02f, 0};
+
+ float poly = alpha0.x * x2 + alpha0.y;
+ poly = poly * x2 + alpha0.z;
+ poly = poly * x2 + alpha0.w;
+ poly = poly * x2 + alpha1.x;
+ poly = poly * x2 + alpha1.y;
+ poly = poly * x2 + alpha1.z;
+
+ return poly;
+}
+
+float evaluate_polynomial_beta(float x2)
+{
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,
+ -1.68282697438203e-03f, -7.37332916720468e-03f};
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};
+
+ float poly = beta0.x * x2 + beta0.y;
+ poly = poly * x2 + beta0.z;
+ poly = poly * x2 + beta0.w;
+ poly = poly * x2 + beta1.x;
+
+ return 1.0f / poly;
+}
+
float eltwise_unary_erf(float _x)
{
- float x = clamp(_x, -2, 2);
- float res = 0;
- float tmp = x;
- float factorial = 1;
- float x_pow = x;
- float one = 1.0f;
- float n = 1;
+ float x = clamp(_x, -4, 4);
+ float x2 = x * x;
- while (fabs(tmp) > 1e-5)
- {
- res += tmp;
-
- factorial *= n;
- one *= -1;
- x_pow *= x * x;
- tmp = one / factorial * x_pow / ( 2 * n + 1);
-
- n += 1.0f;
- }
- return res * MUL2_RSQRTPI;
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);
}
#define ELTWISE_UNARY_F32(func_name) \
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl
new file mode 100644
index 0000000..323f694
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl
@@ -0,0 +1,135 @@
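+/* GatherElements kernels: the I32 index tensor (input1) selects elements from input0 along the kernel's axis; negative indices wrap by adding axis_size. */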
+
+#define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \
+__kernel void gather_elements_axis0_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type data = read_func(input0, (int2)(index, coord.y)); \
+ float4 dst = convert_float4(data) * input_scale + input_tail; \
+ data = conv_func(dst); \
+ \
+ write_func(output, coord, data); \
+}
+GATHER_ELEMENTS_AXIS0_2D(F32, float4, read_imagef, write_imagef, convert_float4)
+GATHER_ELEMENTS_AXIS0_2D(I32, int4, read_imagei, write_imagei, convert_int4_rte)
+GATHER_ELEMENTS_AXIS0_2D(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)
+
+#define GATHER_ELEMENTS_AXIS0(name, data_type, read_func, write_func, conv_func) \
+__kernel void gather_elements_axis0_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type data = read_func(input0, (int4)(index, coord.yzz)); \
+ float4 dst = convert_float4(data) * input_scale + input_tail; \
+ data = conv_func(dst); \
+ \
+ write_func(output, coord, data); \
+}
+GATHER_ELEMENTS_AXIS0(F32, float4, read_imagef, write_imagef, convert_float4)
+GATHER_ELEMENTS_AXIS0(I32, int4, read_imagei, write_imagei, convert_int4_rte)
+GATHER_ELEMENTS_AXIS0(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)
+
+#define GATHER_ELEMENTS_AXIS1_2D(name, data_type, read_func, write_func, conv_func) \
+__kernel void gather_elements_axis1_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type data = read_func(input0, (int2)(coord.x, index)); \
+ float4 dst = convert_float4(data) * input_scale + input_tail; \
+ data = conv_func(dst); \
+ \
+ write_func(output, coord, data); \
+}
+GATHER_ELEMENTS_AXIS1_2D(F32, float4, read_imagef, write_imagef, convert_float4)
+GATHER_ELEMENTS_AXIS1_2D(I32, int4, read_imagei, write_imagei, convert_int4_rte)
+GATHER_ELEMENTS_AXIS1_2D(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)
+
+#define GATHER_ELEMENTS_AXIS1(name, data_type, read_func, write_func, conv_func) \
+__kernel void gather_elements_axis1_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type data = read_func(input0, (int4)(coord.x, index, coord.zz)); \
+ float4 dst = convert_float4(data) * input_scale + input_tail; \
+ data = conv_func(dst); \
+ \
+ write_func(output, coord, data); \
+}
+GATHER_ELEMENTS_AXIS1(F32, float4, read_imagef, write_imagef, convert_float4)
+GATHER_ELEMENTS_AXIS1(I32, int4, read_imagei, write_imagei, convert_int4_rte)
+GATHER_ELEMENTS_AXIS1(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)
+
+#define GATHER_ELEMENTS_AXIS2(name, data_type, read_func, write_func, conv_func) \
+__kernel void gather_elements_axis2_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type data = read_func(input0, (int4)(coord.xy, index, coord.z)); \
+ float4 dst = convert_float4(data) * input_scale + input_tail; \
+ data = conv_func(dst); \
+ \
+ write_func(output, coord, data); \
+}
+GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4)
+GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte)
+GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl
index 36794ea..8a8b113 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl
@@ -5,7 +5,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
__write_only image2d_t output,
int axis,
int axis_size,
- float rsEps
+ float rsEps,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
)
{
int lidx = get_local_id(0);
@@ -28,7 +32,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];
sum = dot(data0, one);
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));
- for(coord.x = gidx; coord.x < axis_size; coord.x += 16)
+ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)
{
src = read_imagef(input, coord);
scale_value = read_imagef(scale, coord_scale);
@@ -70,7 +74,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];
sum = dot(data0, one);
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));
- for(coord.x = gidx; coord.x < axis_size; coord.x += 16)
+ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)
{
src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;
scale_value = read_imagef(scale, coord_scale);
@@ -80,4 +84,45 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
}
}
-
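+/* I32 variant: values are dequantized with inputScale/inputTail, L2-normalized along axis 0, scaled, then requantized to I32. */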
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_axis0_I32_F32toI32_2D(
+ __read_only image2d_t input,
+ __read_only image2d_t scale,
+ __write_only image2d_t output,
+ int axis,
+ int axis_size,
+ float rsEps,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int lidx = get_local_id(0);
+ int gidx = get_global_id(0);
+ float4 src, scale_value, result;
+ float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;
+ int2 coord = (int2)(gidx, get_global_id(1));
+ int2 coord_scale = (int2)(gidx, 0);
+ __local float lcl_sum[16];
+ for(; coord.x < axis_size; coord.x += 16)
+ {
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ pSum += (src.x * src.x);
+ }
+ lcl_sum[lidx] = pSum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ float4 *pLocalPtr = (float4 *)&lcl_sum[0];
+ float4 one = (float4)(1, 1, 1, 1);
+ float4 data0;
+ data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];
+ sum = dot(data0, one);
+ rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));
+ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)
+ {
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ scale_value = read_imagef(scale, coord_scale);
+ result = src * rsqrt_sum * scale_value;
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);
+ write_imagei(output, coord, dst);
+ }
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl
index 39ad98a..9bfc07e 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl
@@ -5,7 +5,11 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
__write_only image2d_t output,
int axis,
int axis_size,
- float rsEps
+ float rsEps,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
)
{
int lidx = get_local_id(1);
@@ -28,7 +32,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];
sum = dot(data0, one);
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));
- for(coord.y = gidy; coord.y < axis_size; coord.y += 16)
+ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)
{
src = read_imagef(input, coord);
scale_value = read_imagef(scale, coord_scale);
@@ -70,7 +74,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];
sum = dot(data0, one);
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));
- for(coord.y = gidy; coord.y < axis_size; coord.y += 16)
+ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)
{
src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;
scale_value = read_imagef(scale, coord_scale);
@@ -79,3 +83,46 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
write_imageui(output, coord, dst);
}
}
+
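+/* Same scheme as the axis-0 kernel above, applied along axis 1 for I32 input. */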
+__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_axis1_I32_F32toI32_2D(
+ __read_only image2d_t input,
+ __read_only image2d_t scale,
+ __write_only image2d_t output,
+ int axis,
+ int axis_size,
+ float rsEps,
+ float inputScale,
+ float inputTail,
+ float outputScale,
+ float outputZP
+ )
+{
+ int lidx = get_local_id(1);
+ int gidy = get_global_id(1);
+ float4 src, scale_value, result;
+ float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;
+ int2 coord = (int2)(get_global_id(0), gidy );
+ int2 coord_scale = (int2)(gidy, 0);
+ __local float lcl_sum[16];
+ for (; coord.y < axis_size; coord.y += 16)
+ {
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ pSum = pSum + src.x * src.x;
+ }
+ lcl_sum[lidx] = pSum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ float4 *pLocalPtr = (float4 *)&lcl_sum[0];
+ float4 one = (float4)(1, 1, 1, 1);
+ float4 data0;
+ data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];
+ sum = dot(data0, one);
+ rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));
+ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)
+ {
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;
+ scale_value = read_imagef(scale, coord_scale);
+ result = src * rsqrt_sum * scale_value;
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);
+ write_imagei(output, coord, dst);
+ }
+}
\ No newline at end of file
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl
index bbf45c2..08e66a7 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl
@@ -121,7 +121,10 @@ __kernel void maximum_I32I32toI32
READ_IMAGEI_2DARRAY(src0, input0, coord);
READ_IMAGEI_2DARRAY(src1, input1, coord);
- int4 dst = src0 > src1 ? src0 : src1;
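+    /* compare in dequantized float space so inputs with different quantization parameters are handled */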
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
+ float4 data = data0 > data1 ? data0 : data1;
+ int4 dst = convert_int4(data * outputScale + outputZP);
write_imagei(output, coord, dst);
}
@@ -144,8 +147,10 @@ __kernel void maximum_I32I32toI32_2D
int4 src0 = read_imagei(input0, coord);
int4 src1 = read_imagei(input1, coord);
- int4 dst = src0 > src1 ? src0 : src1;
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
+ float4 data = data0 > data1 ? data0 : data1;
+ int4 dst = convert_int4(data * outputScale + outputZP);
write_imagei(output, coord, dst);
}
-
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl
index 981d789..27c6501 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl
@@ -121,7 +121,10 @@ __kernel void minimum_I32I32toI32
READ_IMAGEI_2DARRAY(src0, input0, coord);
READ_IMAGEI_2DARRAY(src1, input1, coord);
- int4 dst = src0 < src1 ? src0 : src1;
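+    /* same as maximum: compare after dequantization, then requantize the selected value */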
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
+ float4 data = data0 < data1 ? data0 : data1;
+ int4 dst = convert_int4(data * outputScale + outputZP);
write_imagei(output, coord, dst);
}
@@ -144,8 +147,10 @@ __kernel void minimum_I32I32toI32_2D
int4 src0 = read_imagei(input0, coord);
int4 src1 = read_imagei(input1, coord);
- int4 dst = src0 < src1 ? src0 : src1;
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
+ float4 data = data0 < data1 ? data0 : data1;
+ int4 dst = convert_int4(data * outputScale + outputZP);
write_imagei(output, coord, dst);
}
-
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl
index effa919..076e59f 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl
@@ -26,8 +26,8 @@ __kernel void moments_axis0_U8toF32(
{
data = read_imageui(input, coord0).x;
coord0.x++;
- tmpSum += (data);
- tmpSqr += (data * data);
+ tmpSum = tmpSum + data;
+ tmpSqr = tmpSqr + data * data;
}
sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp)) * e2InScale;
sum = convert_float(as_int(tmpSum - width * input_zp)) * input_scale;
@@ -100,7 +100,7 @@ __kernel void moments_axis0_I32toF32(
for(coord0.x = 0; coord0.x < width;)
{
- data = convert_float(read_imagei(input, coord0).x);
+ data = convert_float(read_imagei(input, coord0).x - input_zp);
coord0.x++;
sum = sum + data;
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl
index 05f9e3a..62dd4d6 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl
@@ -23,8 +23,8 @@ __kernel void moments_axis01_U8toF32(
{
data = read_imageui(input, coord);
coord.y++;
- tmpSum += data.x;
- tmpSqr += data.x * data.x;
+ tmpSum = tmpSum + data.x;
+ tmpSqr = tmpSqr + data.x * data.x;
}
sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;
sum += (tmpSum - height * input_zp) * input_scale;
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl
index 44e9809..2178544 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl
@@ -24,8 +24,8 @@ __kernel void moments_axis012_U8toF32(
{
data = read_imageui(input, coord);
coord.y++;
- tmpSum += data.x;
- tmpSqr += data.x * data.x;
+ tmpSum = tmpSum + data.x;
+ tmpSqr = tmpSqr + data.x * data.x;
}
sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;
sum += (tmpSum - height * input_zp) * input_scale;
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl
index 191e321..64cec51 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl
@@ -20,8 +20,8 @@ __kernel void moments_axis1_U8toF32(
{
data = read_imageui(input, coord0).x;
coord0.y++;
- tmpSum += (data);
- tmpSqr += (data * data);
+ tmpSum = tmpSum + data;
+ tmpSqr = tmpSqr + data * data;
}
sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp)) * e2InScale;
sum = convert_float(as_int(tmpSum - height * input_zp)) * input_scale;
@@ -95,7 +95,7 @@ __kernel void moments_axis1_I32toF32(
for(coord0.y = 0; coord0.y < height;)
{
- data = convert_float(read_imagei(input, coord0).x);
+ data = convert_float(read_imagei(input, coord0).x - input_zp);
coord0.y++;
sum = sum + data;
sqr = sqr + data * data;
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl
index 8cf72cb..4f2330e 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl
@@ -26,8 +26,8 @@ __kernel void moments_axis2_U8toF32(
{
data = read_imageui(input, coord0).x;
coord0.z++;
- tmpSum += (data);
- tmpSqr += (data * data);
+ tmpSum = tmpSum + data;
+ tmpSqr = tmpSqr + data * data;
}
sqr = as_int(tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale;
sum = tmpSum * input_scale;
@@ -107,7 +107,7 @@ __kernel void moments_axis2_I32toF32(
for(coord0.z = 0; coord0.z < chn;)
{
- data = convert_float(read_imagei(input, coord0).x);
+ data = convert_float(read_imagei(input, coord0).x - input_zp);
coord0.z++;
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
index d186c41..e535b86 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
@@ -37,12 +37,13 @@ __kernel void one_hot_I32toI32
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
- int4 val = read_imagei(input, coord.xy);
+ int4 src = read_imagei(input, coord.xy);
+ int val = convert_int(convert_float(src.x) * inputScale - inputTail);
do
{
int4 dst;
- dst.x = val.x == coord.z ? on_value : off_value;
+ dst.x = val == coord.z ? on_value : off_value;
write_imagei(output, coord.xzyw, dst.xxxx);
@@ -63,11 +64,13 @@ __kernel void one_hot_I32toU8
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
- int4 val = read_imagei(input, coord.xy);
+ int4 src = read_imagei(input, coord.xy);
+
+ int val = convert_int(convert_float(src.x) * inputScale - inputTail);
do
{
uint4 dst;
- dst.x = val.x == coord.z ? on_value : off_value;
+ dst.x = val == coord.z ? on_value : off_value;
write_imageui(output, coord.xzyw, dst.xxxx);
@@ -88,12 +91,13 @@ __kernel void one_hot_I32toF32
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
- int4 val = read_imagei(input, coord.xy);
+ int4 src = read_imagei(input, coord.xy);
+ int val = convert_int(convert_float(src.x) * inputScale - inputTail);
do
{
float4 dst;
- dst.x = val.x == coord.z ? on_value : off_value;
+ dst.x = val == coord.z ? on_value : off_value;
write_imagef(output, coord.xzyw, dst.xxxx);
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_0.vx
similarity index 58%
rename from src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx
rename to src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_0.vx
index 086e399..69c3ede 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_0.vx
@@ -3,44 +3,9 @@
_viv_uniform float alpha;
_viv_uniform float beta;
-float4 eltwise_unary_sin(float4 x)
-{
- return native_sin(x);
-}
-
-float4 eltwise_unary_cos(float4 x)
-{
- return native_cos(x);
-}
-
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
-float4 eltwise_unary_exp(float4 x)
-{
- x *= logE;
- x = exp2(x);
- return x;
-}
-
#define rlogE (0.693147182f)
-float4 eltwise_unary_log(float4 x)
-{
- x = log2(x);
- return x * rlogE;
-}
-
-float4 eltwise_unary_elu(float4 val)
-{
- float4 x = val * logE;
- x = exp2(x) * alpha - alpha;
-
- return val < 0 ? x : val;
-}
-
-float4 eltwise_unary_neg(float4 x)
-{
- return x * -1;
-}
float4 eltwise_unary_hard_sigmoid(float4 x)
{
@@ -78,43 +43,51 @@ float4 eltwise_unary_round(float4 x)
return convert_float4(convert_int4_rte(x));
}
-#define MUL2_RSQRTPI (1.1283791670955126f)
-float erf_eval(float x)
+float4 evaluate_polynomial_alpha(float4 x2)
{
- float res = 0;
- float tmp = x;
- float factorial = 1;
- float x_pow = x;
- float one = 1.0f;
- float n = 1;
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,
+ -2.10102402082508e-06f, -5.69250639462346e-05f};
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,
+ -1.60960333262415e-02f, 0};
- if (x <= -3)
- return -1;
- else if(x >= 3)
- return 1;
+ float4 poly = alpha0.x * x2 + alpha0.y;
+ poly = poly * x2 + alpha0.z;
+ poly = poly * x2 + alpha0.w;
+ poly = poly * x2 + alpha1.x;
+ poly = poly * x2 + alpha1.y;
+ poly = poly * x2 + alpha1.z;
- while (fabs(tmp) > 1e-5)
- {
- res += tmp;
-
- factorial *= n;
- one *= -1;
- x_pow *= x * x;
- tmp = one / factorial * x_pow / ( 2 * n + 1);
-
- n += 1.0f;
- }
- return res * MUL2_RSQRTPI;
+ return poly;
}
+
+float4 evaluate_polynomial_beta(float4 x2)
+{
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,
+ -1.68282697438203e-03f, -7.37332916720468e-03f};
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};
+
+ float4 poly = beta0.x * x2 + beta0.y;
+ poly = poly * x2 + beta0.z;
+ poly = poly * x2 + beta0.w;
+ poly = poly * x2 + beta1.x;
+
+ return 1.0f / poly;
+}
+
+float4 erf_eval(float4 _x)
+{
+ float4 x = clamp(_x, -4, 4);
+ float4 x2 = x * x;
+
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);
+}
+
#define RSQRT2 (0.70710678118654752440084436210485f)
float4 eltwise_unary_gelu(float4 x)
{
float4 erf, data;
data = x * RSQRT2;
- erf.x = erf_eval(data.x);
- erf.y = erf_eval(data.y);
- erf.z = erf_eval(data.z);
- erf.w = erf_eval(data.w);
+ erf = erf_eval(data);
x = 0.5f * x * (1 + erf);
return x;
@@ -172,72 +145,6 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
-//EXP
-ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//SIN
-ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//COS
-ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//LOG
-ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//ELU
-ELTSISE_UNARY_2D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//NEG
-ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//MISH
ELTSISE_UNARY_2D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
@@ -327,18 +234,6 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
-//EXP
-ELTSISE_UNARY_BF16_2D(exp)
-//SIN
-ELTSISE_UNARY_BF16_2D(sin)
-//COS
-ELTSISE_UNARY_BF16_2D(cos)
-//LOG
-ELTSISE_UNARY_BF16_2D(log)
-//ELU
-ELTSISE_UNARY_BF16_2D(elu)
-//NEG
-ELTSISE_UNARY_BF16_2D(neg)
//MISH
ELTSISE_UNARY_BF16_2D(mish)
//HARD_SIGMOID
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx
new file mode 100644
index 0000000..696807f
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx
@@ -0,0 +1,221 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float alpha;
+_viv_uniform float beta;
+
+float4 eltwise_unary_sin(float4 x)
+{
+ return native_sin(x);
+}
+
+float4 eltwise_unary_cos(float4 x)
+{
+ return native_cos(x);
+}
+
+#define logE (1.44269502f)
+#define twoLogE (logE * 2.0f)
+float4 eltwise_unary_exp(float4 x)
+{
+ x *= logE;
+ x = exp2(x);
+ return x;
+}
+
+#define rlogE (0.693147182f)
+float4 eltwise_unary_log(float4 x)
+{
+ x = log2(x);
+ return x * rlogE;
+}
+
+float4 eltwise_unary_neg(float4 x)
+{
+ return x * -1;
+}
+
+float4 eltwise_unary_selu(float4 val)
+{
+ float4 x = val * logE;
+ x = exp2(x) * alpha - alpha;
+
+ return val < 0 ? x : val * beta;
+}
+
+float4 eltwise_unary_celu(float4 val)
+{
+ float4 x = val * logE * beta;
+ x = exp2(x) * alpha - alpha;
+
+ return val < 0 ? x : val;
+}
+
+_viv_uniform float inputScale;
+_viv_uniform float inputTail;
+_viv_uniform float outputScale;
+_viv_uniform float outputZP;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;
+_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;
+
+#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \
+ src_copy_type, convert_type, dst_type, dst_copy_type) \
+ __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int type, \
+ float _alpha, \
+ float _beta \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ src_type src0; \
+ src_copy_type src1; \
+ VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, src0, 16); \
+ \
+ float4 vecA; \
+ float4 vecB; \
+ VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \
+ VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \
+ vecA = vecA * inputScale + inputTail; \
+ vecB = vecB * inputScale + inputTail; \
+ vecA = eltwise_unary_##func_name(vecA); \
+ vecB = eltwise_unary_##func_name(vecB); \
+ vecA = vecA * outputScale + outputZP; \
+ vecB = vecB * outputScale + outputZP; \
+ \
+ convert_type dst0, dst1; \
+ _viv_asm(CONV_RTE, dst0, vecA); \
+ _viv_asm(CONV_RTE, dst1, vecB); \
+ dst_type dst2; \
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
+ dst_copy_type dst; \
+ _viv_asm(COPY, dst, dst2, 16); \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+}
+//EXP
+ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//SIN
+ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//COS
+ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//LOG
+ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//SELU
+ELTSISE_UNARY_2D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//NEG
+ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//CELU
+ELTSISE_UNARY_2D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_2D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_2D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_2D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_2D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
+_viv_uniform VXC_512Bits uniExtractOddData_2x8;
+
+#define ELTSISE_UNARY_BF16_2D(func_name) \
+ __kernel void func_name##_BF16toBF16_2D( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int type, \
+ float _alpha, \
+ float _beta \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ vxc_ushort8 src0, src1, dst; \
+ VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ float4 vecA; \
+ float4 vecB; \
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \
+ _viv_asm(COPY, vecA, src1, 16); \
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \
+ _viv_asm(COPY, vecB, src1, 16); \
+ vecA = eltwise_unary_##func_name(vecA); \
+ vecB = eltwise_unary_##func_name(vecB); \
+ \
+ _viv_asm(COPY, src0, vecA, 16); \
+ _viv_asm(COPY, src1, vecB, 16); \
+ \
+ VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+}
+//EXP
+ELTSISE_UNARY_BF16_2D(exp)
+//SIN
+ELTSISE_UNARY_BF16_2D(sin)
+//COS
+ELTSISE_UNARY_BF16_2D(cos)
+//LOG
+ELTSISE_UNARY_BF16_2D(log)
+//SELU
+ELTSISE_UNARY_BF16_2D(selu)
+//NEG
+ELTSISE_UNARY_BF16_2D(neg)
+//CELU
+ELTSISE_UNARY_BF16_2D(celu)
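
For reference, the SELU and CELU kernels added in this new file reduce to the following scalar math. This is a minimal C sketch; the mapping of the kernel uniforms alpha/beta onto the usual operator parameters (SELU's scale folded into both uniforms, CELU's beta acting as 1/alpha) is an assumption about the host-side setup, not something this patch shows.

#include <math.h>

/* Scalar reference for the branches computed by eltwise_unary_selu /
 * eltwise_unary_celu above. The kernels use exp2(x * log2(e)) in place
 * of expf(x). */
static float selu_ref(float x, float alpha, float beta)
{
    return x < 0.0f ? alpha * (expf(x) - 1.0f)          /* alpha*e^x - alpha */
                    : beta * x;
}

static float celu_ref(float x, float alpha, float beta)
{
    return x < 0.0f ? alpha * (expf(beta * x) - 1.0f)   /* beta assumed to be 1/alpha */
                    : x;
}
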
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_0.vx
similarity index 58%
rename from src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx
rename to src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_0.vx
index a7ba363..d04ec5a 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_0.vx
@@ -3,44 +3,9 @@
_viv_uniform float alpha;
_viv_uniform float beta;
-float4 eltwise_unary_sin(float4 x)
-{
- return native_sin(x);
-}
-
-float4 eltwise_unary_cos(float4 x)
-{
- return native_cos(x);
-}
-
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
-float4 eltwise_unary_exp(float4 x)
-{
- x *= logE;
- x = exp2(x);
- return x;
-}
-
#define rlogE (0.693147182f)
-float4 eltwise_unary_log(float4 x)
-{
- x = log2(x);
- return x * rlogE;
-}
-
-float4 eltwise_unary_elu(float4 val)
-{
- float4 x = val * logE;
- x = exp2(x) * alpha - alpha;
-
- return val < 0 ? x : val;
-}
-
-float4 eltwise_unary_neg(float4 x)
-{
- return x * -1;
-}
float4 eltwise_unary_hard_sigmoid(float4 x)
{
@@ -78,43 +43,51 @@ float4 eltwise_unary_round(float4 x)
return convert_float4(convert_int4_rte(x));
}
-#define MUL2_RSQRTPI (1.1283791670955126f)
-float erf_eval(float x)
+float4 evaluate_polynomial_alpha(float4 x2)
{
- float res = 0;
- float tmp = x;
- float factorial = 1;
- float x_pow = x;
- float one = 1.0f;
- float n = 1;
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,
+ -2.10102402082508e-06f, -5.69250639462346e-05f};
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,
+ -1.60960333262415e-02f, 0};
- if (x <= -3)
- return -1;
- else if(x >= 3)
- return 1;
+ float4 poly = alpha0.x * x2 + alpha0.y;
+ poly = poly * x2 + alpha0.z;
+ poly = poly * x2 + alpha0.w;
+ poly = poly * x2 + alpha1.x;
+ poly = poly * x2 + alpha1.y;
+ poly = poly * x2 + alpha1.z;
- while (fabs(tmp) > 1e-5)
- {
- res += tmp;
-
- factorial *= n;
- one *= -1;
- x_pow *= x * x;
- tmp = one / factorial * x_pow / ( 2 * n + 1);
-
- n += 1.0f;
- }
- return res * MUL2_RSQRTPI;
+ return poly;
}
+
+float4 evaluate_polynomial_beta(float4 x2)
+{
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,
+ -1.68282697438203e-03f, -7.37332916720468e-03f};
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};
+
+ float4 poly = beta0.x * x2 + beta0.y;
+ poly = poly * x2 + beta0.z;
+ poly = poly * x2 + beta0.w;
+ poly = poly * x2 + beta1.x;
+
+ return 1.0f / poly;
+}
+
+float4 erf_eval(float4 _x)
+{
+ float4 x = clamp(_x, -4, 4);
+ float4 x2 = x * x;
+
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);
+}
+
#define RSQRT2 (0.70710678118654752440084436210485f)
float4 eltwise_unary_gelu(float4 x)
{
float4 erf, data;
data = x * RSQRT2;
- erf.x = erf_eval(data.x);
- erf.y = erf_eval(data.y);
- erf.z = erf_eval(data.z);
- erf.w = erf_eval(data.w);
+ erf = erf_eval(data);
x = 0.5f * x * (1 + erf);
return x;
@@ -172,72 +145,6 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
-//EXP
-ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//SIN
-ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//COS
-ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//LOG
-ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//ELU
-ELTSISE_UNARY_3D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
-//NEG
-ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
-ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
-ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
-ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
-ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//MISH
ELTSISE_UNARY_3D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
@@ -326,18 +233,6 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
-//EXP
-ELTSISE_UNARY_BF16(exp)
-//SIN
-ELTSISE_UNARY_BF16(sin)
-//COS
-ELTSISE_UNARY_BF16(cos)
-//LOG
-ELTSISE_UNARY_BF16(log)
-//ELU
-ELTSISE_UNARY_BF16(elu)
-//NEG
-ELTSISE_UNARY_BF16(neg)
//MISH
ELTSISE_UNARY_BF16(mish)
//HARD_SIGMOID
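
The erf helper in this file is replaced by a vectorized rational-polynomial approximation, so the GELU kernel now evaluates erf on a whole float4 instead of lane by lane. The formula itself is unchanged; a scalar C sketch of what each lane computes:

#include <math.h>

/* GELU as computed by eltwise_unary_gelu: 0.5 * x * (1 + erf(x / sqrt(2))).
 * The kernel uses its own polynomial erf (clamped to [-4, 4]); erff() stands
 * in for it here. */
static float gelu_ref(float x)
{
    const float RSQRT2 = 0.70710678118654752440f;
    return 0.5f * x * (1.0f + erff(x * RSQRT2));
}
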
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx
new file mode 100644
index 0000000..d150e2a
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx
@@ -0,0 +1,220 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float alpha;
+_viv_uniform float beta;
+
+float4 eltwise_unary_sin(float4 x)
+{
+ return native_sin(x);
+}
+
+float4 eltwise_unary_cos(float4 x)
+{
+ return native_cos(x);
+}
+
+#define logE (1.44269502f)
+#define twoLogE (logE * 2.0f)
+float4 eltwise_unary_exp(float4 x)
+{
+ x *= logE;
+ x = exp2(x);
+ return x;
+}
+
+#define rlogE (0.693147182f)
+float4 eltwise_unary_log(float4 x)
+{
+ x = log2(x);
+ return x * rlogE;
+}
+
+float4 eltwise_unary_neg(float4 x)
+{
+ return x * -1;
+}
+
+float4 eltwise_unary_selu(float4 val)
+{
+ float4 x = val * logE;
+ x = exp2(x) * alpha - alpha;
+
+ return val < 0 ? x : val * beta;
+}
+
+float4 eltwise_unary_celu(float4 val)
+{
+ float4 x = val * logE * beta;
+ x = exp2(x) * alpha - alpha;
+
+ return val < 0 ? x : val;
+}
+
+_viv_uniform float inputScale;
+_viv_uniform float inputTail;
+_viv_uniform float outputScale;
+_viv_uniform float outputZP;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;
+_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;
+
+#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \
+ src_copy_type, convert_type, dst_type, dst_copy_type) \
+__kernel void func_name##_##src_type_name##to##dst_type_name( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int type, \
+ float _alpha, \
+ float _beta \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
+ src_type src0; \
+ src_copy_type src1; \
+ VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, src0, 16); \
+ \
+ float4 vecA; \
+ float4 vecB; \
+ VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \
+ VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \
+ vecA = vecA * inputScale + inputTail; \
+ vecB = vecB * inputScale + inputTail; \
+ vecA = eltwise_unary_##func_name(vecA); \
+ vecB = eltwise_unary_##func_name(vecB); \
+ vecA = vecA * outputScale + outputZP; \
+ vecB = vecB * outputScale + outputZP; \
+ \
+ convert_type dst0, dst1; \
+ _viv_asm(CONV_RTE, dst0, vecA); \
+ _viv_asm(CONV_RTE, dst1, vecB); \
+ dst_type dst2; \
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
+ dst_copy_type dst; \
+ _viv_asm(COPY, dst, dst2, 16); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+}
+//EXP
+ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//SIN
+ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//COS
+ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//LOG
+ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//SELU
+ELTSISE_UNARY_3D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//NEG
+ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+//CELU
+ELTSISE_UNARY_3D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
+ELTSISE_UNARY_3D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
+ELTSISE_UNARY_3D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
+ELTSISE_UNARY_3D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
+ELTSISE_UNARY_3D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
+
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
+_viv_uniform VXC_512Bits uniExtractOddData_2x8;
+#define ELTSISE_UNARY_BF16(func_name) \
+ __kernel void func_name##_BF16toBF16( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int type, \
+ float _alpha, \
+ float _beta \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
+ vxc_ushort8 src0, src1, dst; \
+ VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ float4 vecA; \
+ float4 vecB; \
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \
+ _viv_asm(COPY, vecA, src1, 16); \
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \
+ _viv_asm(COPY, vecB, src1, 16); \
+ vecA = eltwise_unary_##func_name(vecA); \
+ vecB = eltwise_unary_##func_name(vecB); \
+ \
+ _viv_asm(COPY, src0, vecA, 16); \
+ _viv_asm(COPY, src1, vecB, 16); \
+ \
+ VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+}
+//EXP
+ELTSISE_UNARY_BF16(exp)
+//SIN
+ELTSISE_UNARY_BF16(sin)
+//COS
+ELTSISE_UNARY_BF16(cos)
+//LOG
+ELTSISE_UNARY_BF16(log)
+//SELU
+ELTSISE_UNARY_BF16(selu)
+//NEG
+ELTSISE_UNARY_BF16(neg)
+//CELU
+ELTSISE_UNARY_BF16(celu)
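
All of the quantized kernels in these *_1.vx files share the same per-element pipeline: dequantize with inputScale/inputTail, apply the unary op in float, requantize with outputScale/outputZP, then pack with a saturating DP2x8. A plain-C sketch of that flow (the vector loads/stores and the DP packing are omitted):

/* Per-element flow of ELTSISE_UNARY_2D / ELTSISE_UNARY_3D above.
 * The final rounding/saturation to the integer output type is done by
 * VXC_DP2x8 in the kernel and is not repeated here. */
static float unary_quant_ref(int q_in, float inputScale, float inputTail,
                             float (*op)(float),
                             float outputScale, float outputZP)
{
    float x = (float)q_in * inputScale + inputTail;  /* dequantize */
    x = op(x);                                       /* exp/sin/cos/log/selu/neg/celu */
    return x * outputScale + outputZP;               /* requantize */
}
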
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx
index 37bde57..03b36c7 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx
@@ -1,28 +1,42 @@
#include "cl_viv_vx_ext.h"
-#define MUL2_RSQRTPI (1.1283791670955126f)
-float eltwise_unary_erf(float _x)
+float4 evaluate_polynomial_alpha(float4 x2)
{
- float x = clamp(_x, -2, 2);
- float res = 0;
- float tmp = x;
- float factorial = 1;
- float x_pow = x;
- float one = 1.0f;
- float n = 1;
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,
+ -2.10102402082508e-06f, -5.69250639462346e-05f};
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,
+ -1.60960333262415e-02f, 0};
- while (fabs(tmp) > 1e-5)
- {
- res += tmp;
+ float4 poly = alpha0.x * x2 + alpha0.y;
+ poly = poly * x2 + alpha0.z;
+ poly = poly * x2 + alpha0.w;
+ poly = poly * x2 + alpha1.x;
+ poly = poly * x2 + alpha1.y;
+ poly = poly * x2 + alpha1.z;
- factorial *= n;
- one *= -1;
- x_pow *= x * x;
- tmp = one / factorial * x_pow / ( 2 * n + 1);
+ return poly;
+}
- n += 1.0f;
- }
- return res * MUL2_RSQRTPI;
+float4 evaluate_polynomial_beta(float4 x2)
+{
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,
+ -1.68282697438203e-03f, -7.37332916720468e-03f};
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};
+
+ float4 poly = beta0.x * x2 + beta0.y;
+ poly = poly * x2 + beta0.z;
+ poly = poly * x2 + beta0.w;
+ poly = poly * x2 + beta1.x;
+
+ return 1.0f / poly;
+}
+
+float4 eltwise_unary_erf(float4 _x)
+{
+ float4 x = clamp(_x, -4, 4);
+ float4 x2 = x * x;
+
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);
}
_viv_uniform float inputScale;
@@ -48,10 +62,7 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;
float4 vecA; \
VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \
vecA = vecA * inputScale + inputTail; \
- vecA.x = eltwise_unary_##func_name(vecA.x); \
- vecA.y = eltwise_unary_##func_name(vecA.y); \
- vecA.z = eltwise_unary_##func_name(vecA.z); \
- vecA.w = eltwise_unary_##func_name(vecA.w); \
+ vecA = eltwise_unary_##func_name(vecA); \
vecA = vecA * outputScale + outputZP; \
\
convert_type dst0; \
@@ -92,10 +103,7 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, vecA, src1, 16); \
- vecA.x = eltwise_unary_##func_name(vecA.x); \
- vecA.y = eltwise_unary_##func_name(vecA.y); \
- vecA.z = eltwise_unary_##func_name(vecA.z); \
- vecA.w = eltwise_unary_##func_name(vecA.w); \
+ vecA = eltwise_unary_##func_name(vecA); \
\
_viv_asm(COPY, src0, vecA, 16); \
\
@@ -121,10 +129,7 @@ __write_only image2d_array_t output \
float4 vecA; \
VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \
vecA = vecA * inputScale + inputTail; \
- vecA.x = eltwise_unary_##func_name(vecA.x); \
- vecA.y = eltwise_unary_##func_name(vecA.y); \
- vecA.z = eltwise_unary_##func_name(vecA.z); \
- vecA.w = eltwise_unary_##func_name(vecA.w); \
+ vecA = eltwise_unary_##func_name(vecA); \
vecA = vecA * outputScale + outputZP; \
\
convert_type dst0; \
@@ -161,10 +166,7 @@ ELTSISE_UNARY_3D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_s
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \
_viv_asm(COPY, vecA, src1, 16); \
- vecA.x = eltwise_unary_##func_name(vecA.x); \
- vecA.y = eltwise_unary_##func_name(vecA.y); \
- vecA.z = eltwise_unary_##func_name(vecA.z); \
- vecA.w = eltwise_unary_##func_name(vecA.w); \
+ vecA = eltwise_unary_##func_name(vecA); \
\
_viv_asm(COPY, src0, vecA, 16); \
\
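
The series-expansion erf is replaced by a fixed-degree rational approximation erf(x) ~= x * P(x^2) / Q(x^2), evaluated with Horner's scheme on all four lanes at once, and the clamp range widens from [-2, 2] to [-4, 4]. A scalar C transcription with the same coefficients (copied from the kernel) for checking the approximation on the host:

#include <math.h>

static float erf_poly_ref(float x)
{
    x = fmaxf(-4.0f, fminf(x, 4.0f));
    float x2 = x * x;

    /* numerator P(x2), Horner form (alpha0/alpha1 in the kernel) */
    float p = -2.72614225801306e-10f;
    p = p * x2 + 2.77068142495902e-08f;
    p = p * x2 + -2.10102402082508e-06f;
    p = p * x2 + -5.69250639462346e-05f;
    p = p * x2 + -7.34990630326855e-04f;
    p = p * x2 + -2.95459980854025e-03f;
    p = p * x2 + -1.60960333262415e-02f;

    /* denominator Q(x2) (beta0/beta1 in the kernel) */
    float q = -1.45660718464996e-05f;
    q = q * x2 + -2.13374055278905e-04f;
    q = q * x2 + -1.68282697438203e-03f;
    q = q * x2 + -7.37332916720468e-03f;
    q = q * x2 + -1.42647390514189e-02f;

    return x * p / q;
}
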
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/floordiv.vx b/src/tim/vx/internal/src/libnnext/ops/vx/floordiv.vx
index 21a6b90..368d983 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/floordiv.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/floordiv.vx
@@ -58,21 +58,21 @@ __kernel void floordiv_##src0_name##src1_name##to##dst_name \
TENSOR_FLOORDIV(F16, F16, F16, half4, vxc_short8, vxc_short8,\
vxc_half8, CONV, 1, 0, 1, 0, 1, 0)
TENSOR_FLOORDIV(F16, F16, I16, short4, vxc_short8, vxc_short8,\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)
TENSOR_FLOORDIV(F16, F16, I8, char4, vxc_char8, vxc_short8,\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)
TENSOR_FLOORDIV(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\
vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)
TENSOR_FLOORDIV(I16, I16, I16, short4, vxc_short8, vxc_short8,\
- vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)
+ vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)
TENSOR_FLOORDIV(I16, I16, F16, half4, vxc_short8, vxc_short8,\
- vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0)
+ vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)
TENSOR_FLOORDIV(I8, I8, I8, char4, vxc_char8, vxc_char16,\
- vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)
+ vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)
TENSOR_FLOORDIV(I8, I8, F16, half4, vxc_short8, vxc_char16,\
- vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0)
+ vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)
TENSOR_FLOORDIV(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\
vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)
@@ -80,7 +80,6 @@ TENSOR_FLOORDIV(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\
vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)
-
#define TENSOR_FLOORDIV_2D(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \
conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \
__kernel void floordiv_##src0_name##src1_name##to##dst_name##_2D \
@@ -99,21 +98,21 @@ __kernel void floordiv_##src0_name##src1_name##to##dst_name##_2D \
TENSOR_FLOORDIV_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\
vxc_half8, CONV, 1, 0, 1, 0, 1, 0)
TENSOR_FLOORDIV_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)
TENSOR_FLOORDIV_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)
TENSOR_FLOORDIV_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\
vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)
TENSOR_FLOORDIV_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\
- vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)
+ vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)
TENSOR_FLOORDIV_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\
- vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0)
+ vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)
TENSOR_FLOORDIV_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\
- vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)
+ vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)
TENSOR_FLOORDIV_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\
- vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0)
+ vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)
TENSOR_FLOORDIV_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\
vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)
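
These floordiv changes stop hardcoding zero for the input tails and the output zero point in the I8/I16 instantiations, so asymmetric quantization is handled the same way as in the U8 cases. A scalar C sketch of the intended per-element math; the exact relationship between the tail uniforms and the tensor zero points is set up on the host and is assumed here, not shown by the patch.

#include <math.h>

static float floordiv_quant_ref(int q0, float in_scale0, float in0Tail,
                                int q1, float in_scale1, float in1Tail,
                                float out_scale, float out_zp)
{
    float a = (float)q0 * in_scale0 + in0Tail;   /* dequantize input 0 */
    float b = (float)q1 * in_scale1 + in1Tail;   /* dequantize input 1 */
    return floorf(a / b) * out_scale + out_zp;   /* floor-divide, then requantize */
}
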
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx
new file mode 100644
index 0000000..39a8a99
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx
@@ -0,0 +1,153 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform int axis_size;
+
+#define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \
+__kernel void gather_elements_axis0_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ int axis \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ Image img = create_image_from_image2d(input1, 4); \
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \
+ int4 indice = ((int4 *)indice_ptr)[0]; \
+ int4 indice1 = indice + axis_size; \
+ indice = indice < 0 ? indice1 : indice; \
+ \
+ data_type src; \
+ VXC_ReadImage(src, input0, (int2)(indice.x, coord.y), 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src, input0, (int2)(indice.y, coord.y), 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src, input0, (int2)(indice.z, coord.y), 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src, input0, (int2)(indice.w, coord.y), 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+}
+GATHER_ELEMENTS_AXIS0_2D(F16, vxc_short4)
+GATHER_ELEMENTS_AXIS0_2D(I16, vxc_short4)
+GATHER_ELEMENTS_AXIS0_2D(I8, vxc_char4)
+GATHER_ELEMENTS_AXIS0_2D(U8, vxc_uchar4)
+
+#define GATHER_ELEMENTS_AXIS0(name, data_type) \
+__kernel void gather_elements_axis0_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ int axis \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ Tensor img = create_tensor_from_image2d_array(input1, 4); \
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord); \
+ int4 indice = ((int4 *)indice_ptr)[0]; \
+ int4 indice1 = indice + axis_size; \
+ indice = indice < 0 ? indice1 : indice; \
+ \
+ data_type src; \
+ int4 coord_in = coord; \
+ coord_in.x = indice.x; \
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \
+ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = indice.y; \
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \
+ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = indice.z; \
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \
+ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = indice.w; \
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \
+ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+}
+GATHER_ELEMENTS_AXIS0(F16, vxc_short4)
+GATHER_ELEMENTS_AXIS0(I16, vxc_short4)
+GATHER_ELEMENTS_AXIS0(I8, vxc_char4)
+GATHER_ELEMENTS_AXIS0(U8, vxc_uchar4)
+
+#define GATHER_ELEMENTS_AXIS1_2D(name, data_type) \
+__kernel void gather_elements_axis1_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ int axis \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type src; \
+ VXC_ReadImage(src, input0, (int2)(coord.x, index), 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+}
+GATHER_ELEMENTS_AXIS1_2D(F16, vxc_short4)
+GATHER_ELEMENTS_AXIS1_2D(I16, vxc_short4)
+GATHER_ELEMENTS_AXIS1_2D(I8, vxc_char4)
+GATHER_ELEMENTS_AXIS1_2D(U8, vxc_uchar4)
+
+#define GATHER_ELEMENTS_AXIS1(name, data_type) \
+__kernel void gather_elements_axis1_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ int axis \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type src; \
+ int4 coord_in = coord; \
+ coord_in.y = index; \
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \
+ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+}
+GATHER_ELEMENTS_AXIS1(F16, vxc_short4)
+GATHER_ELEMENTS_AXIS1(I16, vxc_short4)
+GATHER_ELEMENTS_AXIS1(I8, vxc_char4)
+GATHER_ELEMENTS_AXIS1(U8, vxc_uchar4)
+
+#define GATHER_ELEMENTS_AXIS2(name, data_type) \
+__kernel void gather_elements_axis2_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ int axis \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ int index = read_imagei(input1, coord).x; \
+ int index1 = index + axis_size; \
+ index = index < 0 ? index1 : index; \
+ \
+ data_type src; \
+ int4 coord_in = coord; \
+ coord_in.z = index; \
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \
+ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+}
+GATHER_ELEMENTS_AXIS2(F16, vxc_short4)
+GATHER_ELEMENTS_AXIS2(I16, vxc_short4)
+GATHER_ELEMENTS_AXIS2(I8, vxc_char4)
+GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4)
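
The new gather_elements kernels pick one element per output position from input0 along a single axis, using input1 as the index tensor; negative indices wrap by adding axis_size. A plain-C sketch of the axis-0, 2-D case:

/* out[y][x] = in[y][ idx[y][x] ], indices wrapped into [0, in_width). */
static void gather_elements_axis0_ref(const short *in, int in_width /* axis_size */,
                                      const int *idx, short *out,
                                      int out_width, int height)
{
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < out_width; ++x)
        {
            int i = idx[y * out_width + x];
            if (i < 0)
                i += in_width;                    /* same wrap as the kernel */
            out[y * out_width + x] = in[y * in_width + i];
        }
    }
}
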
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx
index 1221ed1..bd3a733 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx
@@ -238,21 +238,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \
ushort, half4, vxc_half8, vxc_ushort8)
-L2NORMSCALE_AXIS0_2D(I16, F16, F16, short, vxc_short8, vxc_short8, r_inputScale, \
- ushort, half4, vxc_half8, vxc_ushort8)
-L2NORMSCALE_AXIS0_2D(I16, F16, I16, short, vxc_short8, vxc_short8, r_inputScale, \
- short, int4, vxc_short8, vxc_short8)
-L2NORMSCALE_AXIS0_2D(I8, F16, F16, char, vxc_char8, vxc_char8, r_inputScale, \
- ushort, half4, vxc_half8, vxc_ushort8)
-L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, \
- char, int4, vxc_char8, vxc_char8)
-
-
-#define L2NORMSCALE_AXIS0_U8_2D(in1_name, out_name,\
- dst_type, convert_type, output_type, copy_type) \
+#define L2NORMSCALE_AXIS0_QNT_2D(in0_name, in1_name, out_name,\
+ src_type, src_scalar_type, dst_type, convert_type, output_type, copy_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
- void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \
+void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \
(\
__read_only image2d_t input,\
__read_only image2d_t scale,\
@@ -265,8 +255,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
Image src_img = create_image_from_image2d(input, 1); \
uchar *src_ptr_base = (uchar *)src_img.ptr; \
uchar *src_ptr; \
- vxc_uchar8 src0, src1; \
- vxc_uchar8 val0, val1; \
+ src_type src0, src1; \
+ src_type val0, val1; \
int inputRemain; \
vxc_float4 sum = {0.0f}; \
vxc_uchar8 input_ZP ; \
@@ -274,10 +264,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
src_ptr = src_ptr_base + (get_global_id(0) + get_global_id(1) * inputWidth); \
for (int i = 0; i < inputWidthCount; i++) \
{ \
- VXC_Vload8(src0, src_ptr, 0); \
- VXC_Vload8(src1, src_ptr, 1); \
- _viv_asm(COPY, val0, src0, 16); \
- _viv_asm(COPY, val1, src1, 16); \
+ VXC_Vload8(val0, src_ptr, 0); \
+ VXC_Vload8(val1, src_ptr, 1); \
VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\
uniSumSqrt_16x1); \
VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 1),\
@@ -292,7 +280,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
inputRemain = inputWidth - offset; \
if (inputRemain > 0) \
{ \
- L2NORMSCALE_REM_PROCESS((uchar)inputZP) \
+ L2NORMSCALE_REM_PROCESS((src_scalar_type)inputZP) \
_viv_asm(COPY, val0, src0, 16); \
_viv_asm(COPY, val1, src1, 16); \
VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\
@@ -314,5 +302,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \
}
-L2NORMSCALE_AXIS0_U8_2D(F16, F16, ushort, half4, vxc_half8, vxc_ushort8)
-L2NORMSCALE_AXIS0_U8_2D(F16, U8, uchar, int4, vxc_uchar8, vxc_uchar8)
+L2NORMSCALE_AXIS0_QNT_2D(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8)
+L2NORMSCALE_AXIS0_QNT_2D(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8)
+L2NORMSCALE_AXIS0_QNT_2D(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8)
+L2NORMSCALE_AXIS0_QNT_2D(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8)
+L2NORMSCALE_AXIS0_QNT_2D(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8)
+L2NORMSCALE_AXIS0_QNT_2D(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8)
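
The axis-0 refactor folds the separate U8/I8/I16 kernels into one L2NORMSCALE_AXIS0_QNT_2D macro parameterized on the source vector type. The underlying math is unchanged: each row is divided by its L2 norm and multiplied element-wise by the scale tensor. A scalar C sketch on already-dequantized floats (the zero-point subtraction and r_inputScale handling in the kernel are omitted):

#include <math.h>

static void l2normscale_axis0_ref(const float *in, const float *scale, float *out,
                                  int width, int height)
{
    for (int y = 0; y < height; ++y)
    {
        float sum_sq = 0.0f;
        for (int x = 0; x < width; ++x)
            sum_sq += in[y * width + x] * in[y * width + x];

        float rnorm = 1.0f / sqrtf(sum_sq);       /* rsqrt of the row's sum of squares */
        for (int x = 0; x < width; ++x)
            out[y * width + x] = in[y * width + x] * rnorm * scale[x];
    }
}
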
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis1.vx
index 65daaed..bb69d3b 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis1.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis1.vx
@@ -7,10 +7,6 @@ _viv_uniform VXC_512Bits UniFp16MulHi_dp4x4;
//int8 version
_viv_uniform float r_inputScale;
-_viv_uniform VXC_512Bits uniIntegerSquareLo_4x4;
-_viv_uniform VXC_512Bits uniIntegerSquareHi_4x4;
-_viv_uniform VXC_512Bits uniDataSquareAddU32Lo_4x4;
-_viv_uniform VXC_512Bits uniDataSquareAddU32Hi_4x4;
_viv_uniform VXC_512Bits uniUInt8SquareLo_4x4;
_viv_uniform VXC_512Bits uniUInt8SquareHi_4x4;
@@ -127,10 +123,9 @@ __kernel void l2normalizescale_axis1_F16_##in1_name##to##out_name##_2D \
L2NORMSCALE_AXIS1_F16_2D(F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
-
-#define L2NORMSCALE_AXIS1_I8_2D(in1_name, out_name,\
+#define L2NORMSCALE_AXIS1_QNT_2D(in0_name, in1_name, out_name,\
input_type, incopy_type, output_type, convert_type, copy_type) \
-__kernel void l2normalizescale_axis1_I8_##in1_name##to##out_name##_2D \
+__kernel void l2normalizescale_axis1_##in0_name##_##in1_name##to##out_name##_2D \
(\
__read_only image2d_array_t input,\
__read_only image2d_array_t scale,\
@@ -139,93 +134,11 @@ __kernel void l2normalizescale_axis1_I8_##in1_name##to##out_name##_2D \
)\
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
- vxc_char8 src0_I8, src1_I8; \
- vxc_uint4 dst0_I8 = 0, dst1_I8 = 0; \
- for(int i = 0; i < L2NorS_depth; i += 2) \
- { \
- VXC_ReadImage(src0_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
- VXC_ReadImage(src1_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
- coord.y += 2; \
- VXC_DP4x4(dst0_I8, src0_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniDataSquareAddU32Lo_4x4); \
- VXC_DP4x4(dst1_I8, src0_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniDataSquareAddU32Hi_4x4); \
- VXC_DP4x4(dst0_I8, src1_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniDataSquareAddU32Lo_4x4); \
- VXC_DP4x4(dst1_I8, src1_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniDataSquareAddU32Hi_4x4); \
- } \
- vxc_float4 sum_lo, sum_hi; \
- sum_lo = convert_float4(dst0_I8); \
- sum_hi = convert_float4(dst1_I8); \
- sum_lo = rsqrt(sum_lo) * r_inputScale; \
- sum_hi = rsqrt(sum_hi) * r_inputScale; \
- L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \
-}
-
-L2NORMSCALE_AXIS1_I8_2D(F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
-L2NORMSCALE_AXIS1_I8_2D(F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
-
-
-#define L2NORMSCALE_AXIS1_I16_2D(in1_name, out_name,\
- input_type, incopy_type, output_type, convert_type, copy_type) \
-__kernel void l2normalizescale_axis1_I16_##in1_name##to##out_name##_2D \
- (\
- __read_only image2d_array_t input,\
- __read_only image2d_array_t scale,\
- __write_only image2d_array_t output,\
- int axis\
- )\
-{ \
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
- vxc_short8 src0_I16, src1_I16; \
+ input_type src0_U8, src1_U8; \
vxc_float4 squr, sum_lo = 0, sum_hi = 0; \
for(int i = 0; i < L2NorS_depth; i += 2) \
{ \
- VXC_ReadImage(src0_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
- VXC_ReadImage(src1_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
- coord.y += 2; \
- VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniIntegerSquareLo_4x4); \
- sum_lo = squr + sum_lo; \
- VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniIntegerSquareHi_4x4); \
- sum_hi = squr + sum_hi; \
- VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniIntegerSquareLo_4x4); \
- sum_lo = squr + sum_lo; \
- VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
- uniIntegerSquareHi_4x4); \
- sum_hi = squr + sum_hi; \
- } \
- sum_lo = rsqrt(sum_lo) * r_inputScale; \
- sum_hi = rsqrt(sum_hi) * r_inputScale; \
- L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \
-}
-
-L2NORMSCALE_AXIS1_I16_2D(F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
-L2NORMSCALE_AXIS1_I16_2D(F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
-
-#define L2NORMSCALE_AXIS1_U8_2D(in1_name, out_name,\
- input_type, incopy_type, output_type, convert_type, copy_type) \
-__kernel void l2normalizescale_axis1_U8_##in1_name##to##out_name##_2D \
- (\
- __read_only image2d_array_t input,\
- __read_only image2d_array_t scale,\
- __write_only image2d_array_t output,\
- int axis\
- )\
-{ \
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \
- vxc_uchar8 src0_U8, src1_U8; \
- vxc_float4 squr, sum_lo = 0, sum_hi = 0; \
- for(int i = 0; i < L2NorS_depth; i += 2) \
- { \
- vxc_uchar8 zero; \
+ vxc_short2 zero; \
VXC_ReadImage(src0_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src1_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\
@@ -246,5 +159,9 @@ __kernel void l2normalizescale_axis1_U8_##in1_name##to##out_name##_2D \
L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \
}
-L2NORMSCALE_AXIS1_U8_2D(F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
-L2NORMSCALE_AXIS1_U8_2D(F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+L2NORMSCALE_AXIS1_QNT_2D(U8, F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
+L2NORMSCALE_AXIS1_QNT_2D(U8, F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+L2NORMSCALE_AXIS1_QNT_2D(I8, F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
+L2NORMSCALE_AXIS1_QNT_2D(I8, F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
+L2NORMSCALE_AXIS1_QNT_2D(I16, F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
+L2NORMSCALE_AXIS1_QNT_2D(I16, F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx
deleted file mode 100644
index cb7c067..0000000
--- a/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx
+++ /dev/null
@@ -1,325 +0,0 @@
-#include "cl_viv_vx_ext.h"
-
-__kernel void maximum_F16F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_F16F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- coord.z ++;
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;
-
-__kernel void maximum_F16F16toI8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 vec0, vec1;
- vxc_char8 dst;
- vxc_half8 src0, src1;
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_F16F16toI8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_short8 vec0, vec1;
- vxc_char8 dst;
- vxc_half8 src0, src1;
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8;
-__kernel void maximum_I8I8toI8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_char16 src0, src1, dst;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);
- dst = max(src0, src1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I8I8toI8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_char16 src0, src1, dst;
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- coord.z ++;
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);
- dst = max(src0, src1);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
-__kernel void maximum_U8U8toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar16 src0, src1, dst;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Hi_2x8);
- dst = max(src0, src1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_U8U8toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_uchar16 src0, src1, dst;
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Hi_2x8);
- dst = max(src0, src1);
-
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_U8U8toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar16 src0, src1;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_short8 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = max(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_U8U8toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_uchar16 src0, src1;
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_short8 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = max(dst0, dst1);
-
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;
-_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;
-__kernel void maximum_I16I16toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 src0, src1, dst;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);
- dst = max(src0, src1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I16I16toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 src0, src1, dst;
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- coord.z ++;
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);
- dst = max(src0, src1);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
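The deleted maximum.vx above spelled out one __kernel per datatype combination (F16F16toF16, I8I8toI8, U8U8toU8, ...), each repeating the same read / convert / max / write sequence. The replacement files maximum_0.vx and maximum_1.vx below generate those bodies from token-pasting macros instead, so every type combination reduces to a one-line instantiation. A minimal sketch of the same pattern in plain C, with hypothetical names (DEFINE_ELTWISE_MAX, eltwise_max_*) standing in for the .vx macros:

    #include <stdint.h>
    #include <stdio.h>

    /* One macro expands into a typed element-wise max routine; the ##
     * pasting builds the function name, mirroring maximum_##name in the
     * .vx kernels. */
    #define DEFINE_ELTWISE_MAX(SUFFIX, TYPE)                               \
        static void eltwise_max_##SUFFIX(const TYPE *a, const TYPE *b,     \
                                         TYPE *out, int n)                 \
        {                                                                  \
            for (int i = 0; i < n; ++i)                                    \
                out[i] = a[i] > b[i] ? a[i] : b[i];                        \
        }

    DEFINE_ELTWISE_MAX(I8,  int8_t)   /* counterpart of I8I8toI8    */
    DEFINE_ELTWISE_MAX(U8,  uint8_t)  /* counterpart of U8U8toU8    */
    DEFINE_ELTWISE_MAX(I16, int16_t)  /* counterpart of I16I16toI16 */

    int main(void)
    {
        int16_t a[4] = {1, -5, 7, 0}, b[4] = {2, -9, 3, 0}, c[4];
        eltwise_max_I16(a, b, c, 4);
        printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]); /* 2 -5 7 0 */
        return 0;
    }

The .vx files keep the hardware specifics (VXC_ReadImage*, VXC_DP2x8, the per-lane modifiers) inside the macro body, so adding a new type combination only adds an instantiation line.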
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_0.vx
new file mode 100644
index 0000000..2803730
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_0.vx
@@ -0,0 +1,194 @@
+#include "cl_viv_vx_ext.h"
+
+__kernel void maximum_F16F16toF16
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+
+ vxc_short8 vec0, vec1, dst;
+ vxc_half8 src0, src1;
+ VXC_ReadImage2DArray(vec0, input0, coord, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src0, vec0, 16);
+ VXC_ReadImage2DArray(vec1, input1, coord, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src1, vec1, 16);
+
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
+ _viv_asm(COPY, dst, src0, 16);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
+}
+
+__kernel void maximum_F16F16toF16_2D
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
+
+ vxc_short8 vec0, vec1, dst;
+ vxc_half8 src0, src1;
+ VXC_ReadImage(vec0, input0, coord.xy, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src0, vec0, 16);
+ VXC_ReadImage(vec1, input1, coord.xy, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src1, vec1, 16);
+
+ coord.z ++;
+
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
+ _viv_asm(COPY, dst, src0, 16);
+
+ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
+}
+
+_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;
+_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
+_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
+
+#define MAXIMUM_8BITS_QUANT_IMPL(name, dtype) \
+__kernel void maximum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ dtype src0, src1, dst; \
+ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Hi_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Hi_2x8); \
+ dst = max(src0, src1); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_8BITS_QUANT_IMPL(U8U8toU8, vxc_uchar16)
+MAXIMUM_8BITS_QUANT_IMPL(I8I8toI8, vxc_char16)
+
+#define MAXIMUM_8BITS_2D_QUANT_IMPL(name, dtype) \
+__kernel void maximum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ dtype src0, src1, dst; \
+ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Hi_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Hi_2x8); \
+ dst = max(src0, src1); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_8BITS_2D_QUANT_IMPL(U8U8toU8, vxc_uchar16)
+MAXIMUM_8BITS_2D_QUANT_IMPL(I8I8toI8, vxc_char16)
+
+#define MAXIMUM_QUANT_IMPL(name, src_type, copy_type, dst_type) \
+__kernel void maximum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ copy_type data0, data1; \
+ src_type src0, src1; \
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ dst_type dst0, dst1, dst; \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = max(dst0, dst1); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_QUANT_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)
+MAXIMUM_QUANT_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)
+MAXIMUM_QUANT_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)
+MAXIMUM_QUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)
+MAXIMUM_QUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)
+MAXIMUM_QUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
+
+#define MAXIMUM_QUANT_2D_IMPL(name, src_type, copy_type, dst_type) \
+__kernel void maximum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ copy_type data0, data1; \
+ src_type src0, src1; \
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ dst_type dst0, dst1, dst; \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = max(dst0, dst1); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_QUANT_2D_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)
+MAXIMUM_QUANT_2D_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)
+MAXIMUM_QUANT_2D_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)
+MAXIMUM_QUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)
+MAXIMUM_QUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)
+MAXIMUM_QUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
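Within maximum_0.vx both operands are first brought to the output's quantization: multAndoutZP0/1 pack a fixed-point multiplier and the output zero point, and the uniU8MulAndPostShift* tables drive a VXC_DP2x8 that conceptually computes q_out = round(q_in * M >> shift) + zp_out per lane before the max is taken. A scalar sketch of that requantize-then-max step, with illustrative parameters (mult, shift, zp_out are not the packed uniform layout, and the input zero point, which the real kernels fold into the dot-product constants, is omitted):

    #include <stdint.h>
    #include <stdio.h>

    /* Requantize one uint8 value into the output scale: fixed-point
     * multiply, rounding right shift, add the output zero point, saturate. */
    static uint8_t requant_u8(uint8_t q, int32_t mult, int shift, int32_t zp_out)
    {
        int32_t v = ((int32_t)q * mult + (1 << (shift - 1))) >> shift;
        v += zp_out;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }

    /* Element-wise maximum once both operands share the output scale,
     * the scalar analogue of the U8U8toU8 kernel body. */
    static uint8_t max_u8u8_to_u8(uint8_t a, uint8_t b,
                                  int32_t m0, int s0, int32_t m1, int s1,
                                  int32_t zp_out)
    {
        uint8_t qa = requant_u8(a, m0, s0, zp_out);
        uint8_t qb = requant_u8(b, m1, s1, zp_out);
        return qa > qb ? qa : qb;
    }

    int main(void)
    {
        /* multiplier 1<<14 with shift 14 is a scale ratio of 1.0 */
        printf("%d\n", max_u8u8_to_u8(100, 120, 1 << 14, 14, 1 << 14, 14, 0));
        return 0;
    }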
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_1.vx
new file mode 100644
index 0000000..3220e35
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_1.vx
@@ -0,0 +1,211 @@
+#include "cl_viv_vx_ext.h"
+
+
+_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
+_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
+_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
+
+#define MAXIMUM_F16TOQUANT_IMPL(name, src0_type, copy_type, dst_type) \
+__kernel void maximum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ copy_type data0; \
+ src0_type src0; \
+ vxc_half8 src1; \
+ vxc_short8 data1; \
+ dst_type dst0, dst1, dst; \
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = max(dst0, dst1); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_F16TOQUANT_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)
+MAXIMUM_F16TOQUANT_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)
+MAXIMUM_F16TOQUANT_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)
+MAXIMUM_F16TOQUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
+MAXIMUM_F16TOQUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)
+MAXIMUM_F16TOQUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
+
+#define MAXIMUM_F16TOQUANT_2D_IMPL(name, src0_type, copy_type, dst_type) \
+__kernel void maximum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ copy_type data0; \
+ src0_type src0; \
+ vxc_half8 src1; \
+ vxc_short8 data1; \
+ dst_type dst0, dst1, dst; \
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = max(dst0, dst1); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_F16TOQUANT_2D_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)
+MAXIMUM_F16TOQUANT_2D_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)
+MAXIMUM_F16TOQUANT_2D_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)
+MAXIMUM_F16TOQUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
+MAXIMUM_F16TOQUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)
+MAXIMUM_F16TOQUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
+
+#define MAXIMUM_QUANT_F16TOF16_IMPL(name, src_type) \
+__kernel void maximum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ src_type vec0; \
+ vxc_half8 src0, src1; \
+ vxc_short8 data1, dst; \
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 ms0; \
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ \
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_QUANT_F16TOF16_IMPL(U8F16toF16, vxc_uchar16)
+MAXIMUM_QUANT_F16TOF16_IMPL(I8F16toF16, vxc_char16)
+MAXIMUM_QUANT_F16TOF16_IMPL(I16F16toF16, vxc_short8)
+
+#define MAXIMUM_QUANT_F16TOF16_2D_IMPL(name, src_type) \
+__kernel void maximum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ src_type vec0; \
+ vxc_half8 src0, src1; \
+ vxc_short8 data1, dst; \
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 ms0; \
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ \
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_QUANT_F16TOF16_2D_IMPL(U8F16toF16, vxc_uchar16)
+MAXIMUM_QUANT_F16TOF16_2D_IMPL(I8F16toF16, vxc_char16)
+MAXIMUM_QUANT_F16TOF16_2D_IMPL(I16F16toF16, vxc_short8)
+
+#define MAXIMUM_QUANTTOF16_IMPL(name, src_type) \
+__kernel void maximum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ src_type vec0, vec1; \
+ vxc_half8 src0, src1; \
+ vxc_short8 dst; \
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ \
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_QUANTTOF16_IMPL(U8U8toF16, vxc_uchar16)
+MAXIMUM_QUANTTOF16_IMPL(I8I8toF16, vxc_char16)
+MAXIMUM_QUANTTOF16_IMPL(I16I16toF16, vxc_short8)
+
+#define MAXIMUM_QUANTTOF16_2D_IMPL(name, src_type) \
+__kernel void maximum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ src_type vec0, vec1; \
+ vxc_half8 src0, src1; \
+ vxc_short8 dst; \
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ \
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MAXIMUM_QUANTTOF16_2D_IMPL(U8U8toF16, vxc_uchar16)
+MAXIMUM_QUANTTOF16_2D_IMPL(I8I8toF16, vxc_char16)
+MAXIMUM_QUANTTOF16_2D_IMPL(I16I16toF16, vxc_short8)
\ No newline at end of file
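maximum_1.vx covers the mixed cases: the quantized operand is rescaled into half precision through the same multAndoutZP0 / MulAndPostShift dot product, the maximum is taken in F16 with VXC_VertMax3_Half, and for quantized outputs the result is converted back. Ignoring the fixed-point machinery, the reference behaviour is dequantize, take the max in floating point, then requantize if needed. A hedged C sketch of the U8F16toF16 case with illustrative scale/zero-point parameters (float stands in for the kernel's half precision):

    #include <stdint.h>
    #include <stdio.h>

    /* Affine dequantization: real = scale * (q - zero_point). */
    static float dequant_u8(uint8_t q, float scale, int32_t zp)
    {
        return scale * (float)((int32_t)q - zp);
    }

    /* Reference for maximum_U8F16toF16: one quantized input, one float
     * input, float output. */
    static float max_u8_f16_to_f16(uint8_t a, float in0_scale, int32_t in0_zp,
                                   float b)
    {
        float fa = dequant_u8(a, in0_scale, in0_zp);
        return fa > b ? fa : b;
    }

    int main(void)
    {
        /* input0 quantized with scale 0.5 and zero point 128 */
        printf("%f\n", max_u8_f16_to_f16(140, 0.5f, 128, 5.25f)); /* 6.000000 */
        return 0;
    }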
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_fp16.vx
deleted file mode 100644
index 76269f7..0000000
--- a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_fp16.vx
+++ /dev/null
@@ -1,317 +0,0 @@
-#include "cl_viv_vx_ext.h"
-
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;
-_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;
-
-__kernel void maximum_I8F16toI8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_char16 src0, src2, dst;
- vxc_short8 src1, src3, src4, src5;
- vxc_half8 data0, data1, data2, data3;
- vxc_char16 tmp0, tmp1;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
- _viv_asm(COPY, data1, src4, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- dst = max(src0, tmp0);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I8F16toI8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_char16 src0, src2, dst;
- vxc_short8 src1, src3, src4, src5;
- vxc_half8 data0, data1, data2, data3;
- vxc_char16 tmp0;
-
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
- _viv_asm(COPY, data1, src4, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- dst = max(src0, tmp0);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I8F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_char8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
-
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I8F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_char8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
-_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
-
-__kernel void maximum_U8F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
-
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- vxc_ushort8 ms0;
- _viv_asm(COPY, ms0, multAndoutZP0, 16);
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
- uniU8MulAndPostShift_0_Lo_2x8);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_U8F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_uchar8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
-
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- vxc_ushort8 ms0;
- _viv_asm(COPY, ms0, multAndoutZP0, 16);
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
- uniU8MulAndPostShift_0_Lo_2x8);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;
-_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
-__kernel void maximum_U8F16toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar16 src0, dst0, dst1;
- vxc_ushort8 src1, src2;
- vxc_half8 data1, data2;
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data1, src1, 16);
- _viv_asm(COPY, data2, src2, 16);
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = max(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_U8F16toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_uchar16 src0, dst0, dst1;
- vxc_ushort8 src1, src2;
- vxc_half8 data1, data2;
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data1, src1, 16);
- _viv_asm(COPY, data2, src2, 16);
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = max(dst0, dst1);
-
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_F16F16toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_ushort8 src0, src1;
- vxc_half8 data0, data1;
- vxc_uchar16 dst0, dst1;
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data0, src0, 16);
- _viv_asm(COPY, data1, src1, 16);
-
- vxc_ushort8 mp1;
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = max(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_F16F16toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_ushort8 src0, src1;
- vxc_half8 data0, data1;
- vxc_uchar16 dst0, dst1;
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data0, src0, 16);
- _viv_asm(COPY, data1, src1, 16);
-
- vxc_ushort8 mp1;
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = max(dst0, dst1);
-
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx
deleted file mode 100644
index aab5d72..0000000
--- a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx
+++ /dev/null
@@ -1,233 +0,0 @@
-#include "cl_viv_vx_ext.h"
-
-_viv_uniform VXC_512Bits uniConvertI16toI16_2x8;
-_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;
-_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;
-_viv_uniform float outputScale;
-_viv_uniform float output_zp;
-_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
-_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;
-_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4;
-
-
-__kernel void maximum_I16F16toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 src0, src1, tmp0, dst;
- vxc_half8 data0;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);
- dst = max(src0, tmp0);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I16F16toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 src0, src1, tmp0, dst;
- vxc_half8 data0;
-
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);
- dst = max(src0, tmp0);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I16F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
-
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I16F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_F16F16toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 vec0, vec1;
- vxc_short8 dst;
- vxc_half8 src0, src1;
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- int4 tmpDst0, tmpDst1;
- float4 tmpData0, tmpData1;
- VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);
- VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_F16F16toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_short8 vec0, vec1;
- vxc_short8 dst;
- vxc_half8 src0, src1;
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- int4 tmpDst0, tmpDst1;
- float4 tmpData0, tmpData1;
- VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);
- VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
-__kernel void maximum_I16I16toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 src0, src1;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- vxc_uchar16 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = max(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void maximum_I16I16toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 src0, src1;
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- vxc_uchar16 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = max(dst0, dst1);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
\ No newline at end of file
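The deleted maximum_i16.vx routed F16F16toI16 through the float path: the maximum is taken in half precision (VXC_VertMax3_Half), the result widened to fp32 (uniConvert1st/2ndFp16ToFp32_4x4), scaled by outputScale, offset by output_zp, and converted with convert_int4_rte (round to nearest, ties to even) before being packed back to 16 bits. A scalar sketch of that final quantize step, assuming the same meaning for the scale and zero point:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Quantize a float maximum to int16 the way the deleted kernel did:
     * scale, add the output zero point, round ties-to-even, saturate. */
    static int16_t quant_i16_rte(float x, float out_scale, float out_zp)
    {
        float v = rintf(x * out_scale + out_zp); /* default mode: ties-to-even */
        if (v < -32768.0f) v = -32768.0f;
        if (v >  32767.0f) v =  32767.0f;
        return (int16_t)v;
    }

    int main(void)
    {
        float a = 1.25f, b = 0.75f;
        float m = a > b ? a : b;                        /* maximum in float */
        printf("%d\n", quant_i16_rte(m, 256.0f, 0.0f)); /* prints 320 */
        return 0;
    }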
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx
deleted file mode 100644
index 0b3ef97..0000000
--- a/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx
+++ /dev/null
@@ -1,327 +0,0 @@
-#include "cl_viv_vx_ext.h"
-
-__kernel void minimum_F16F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_F16F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- coord.z ++;
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;
-
-__kernel void minimum_F16F16toI8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 vec0, vec1;
- vxc_char8 dst;
- vxc_half8 src0, src1;
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_F16F16toI8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_short8 vec0, vec1;
- vxc_char8 dst;
- vxc_half8 src0, src1;
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src0, vec0, 16);
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
-
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8;
-__kernel void minimum_I8I8toI8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_char16 src0, src1, dst;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);
- dst = min(src0, src1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I8I8toI8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_char16 src0, src1, dst;
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- coord.z ++;
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);
- dst = min(src0, src1);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
-__kernel void minimum_U8U8toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar16 src0, src1, dst;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Hi_2x8);
- dst = min(src0, src1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_U8U8toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_uchar16 src0, src1, dst;
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Hi_2x8);
- dst = min(src0, src1);
-
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_U8U8toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar16 src0, src1;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_short8 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = min(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_U8U8toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_uchar16 src0, src1;
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
-
- vxc_short8 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = min(dst0, dst1);
-
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;
-_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;
-__kernel void minimum_I16I16toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 src0, src1, dst;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);
- dst = min(src0, src1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I16I16toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 src0, src1, dst;
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- coord.z ++;
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);
- dst = min(src0, src1);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_0.vx
new file mode 100644
index 0000000..7762d1d
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_0.vx
@@ -0,0 +1,194 @@
+#include "cl_viv_vx_ext.h"
+
+__kernel void minimum_F16F16toF16
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+
+ vxc_short8 vec0, vec1, dst;
+ vxc_half8 src0, src1;
+ VXC_ReadImage2DArray(vec0, input0, coord, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src0, vec0, 16);
+ VXC_ReadImage2DArray(vec1, input1, coord, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src1, vec1, 16);
+
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
+ _viv_asm(COPY, dst, src0, 16);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
+}
+
+__kernel void minimum_F16F16toF16_2D
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
+
+ vxc_short8 vec0, vec1, dst;
+ vxc_half8 src0, src1;
+ VXC_ReadImage(vec0, input0, coord.xy, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src0, vec0, 16);
+ VXC_ReadImage(vec1, input1, coord.xy, 0,\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, src1, vec1, 16);
+
+ coord.z ++;
+
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
+ _viv_asm(COPY, dst, src0, 16);
+
+ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
+}
+
+_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;
+_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
+_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
+
+#define MINIMUM_8BITS_QUANT_IMPL(name, dtype) \
+__kernel void minimum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ dtype src0, src1, dst; \
+ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Hi_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Hi_2x8); \
+ dst = min(src0, src1); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_8BITS_QUANT_IMPL(U8U8toU8, vxc_uchar16)
+MINIMUM_8BITS_QUANT_IMPL(I8I8toI8, vxc_char16)
+
+#define MINIMUM_8BITS_2D_QUANT_IMPL(name, dtype) \
+__kernel void minimum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ dtype src0, src1, dst; \
+ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Hi_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Hi_2x8); \
+ dst = min(src0, src1); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_8BITS_2D_QUANT_IMPL(U8U8toU8, vxc_uchar16)
+MINIMUM_8BITS_2D_QUANT_IMPL(I8I8toI8, vxc_char16)
+
+#define MINIMUM_QUANT_IMPL(name, src_type, copy_type, dst_type) \
+__kernel void minimum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ copy_type data0, data1; \
+ src_type src0, src1; \
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ dst_type dst0, dst1, dst; \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = min(dst0, dst1); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_QUANT_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)
+MINIMUM_QUANT_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)
+MINIMUM_QUANT_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)
+MINIMUM_QUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)
+MINIMUM_QUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)
+MINIMUM_QUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
+
+#define MINIMUM_QUANT_2D_IMPL(name, src_type, copy_type, dst_type) \
+__kernel void minimum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ copy_type data0, data1; \
+ src_type src0, src1; \
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ dst_type dst0, dst1, dst; \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = min(dst0, dst1); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_QUANT_2D_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)
+MINIMUM_QUANT_2D_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)
+MINIMUM_QUANT_2D_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)
+MINIMUM_QUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)
+MINIMUM_QUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)
+MINIMUM_QUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
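Note on the requantization used by the macro kernels in minimum_0.vx above: each input is rescaled to the output quantization with a 16-bit multiplier and post-shift (the uniU8MulAndPostShift uniforms driven by multAndoutZP0/multAndoutZP1) before the elementwise min is taken, so both operands are compared on the same scale. Below is a minimal scalar sketch of that arithmetic, assuming a rounding right shift and 8-bit saturation; the multiplier, shift and zero-point values are illustrative, not taken from this patch.

```c
#include <stdint.h>

/* Hypothetical scalar model of the multiply + post-shift requantization the
 * VXC_DP2x8 calls above perform per lane:
 *     out = clamp(((in * multiplier) >> post_shift) + out_zp, 0, 255)
 * multiplier, post_shift and out_zp are illustrative parameters; the real
 * values live in the uniU8MulAndPostShift and multAndoutZP uniforms set up
 * by the host and are not shown in this patch. */
static uint8_t requant_u8(uint8_t in, int32_t multiplier, int post_shift, int32_t out_zp)
{
    int64_t acc = (int64_t)in * multiplier;
    if (post_shift > 0)                       /* rounding right shift */
        acc = (acc + (1LL << (post_shift - 1))) >> post_shift;
    acc += out_zp;
    if (acc < 0)   acc = 0;                   /* saturate to the U8 range */
    if (acc > 255) acc = 255;
    return (uint8_t)acc;
}

/* Elementwise minimum once both inputs share the output scale. */
static uint8_t minimum_u8(uint8_t a, uint8_t b,
                          int32_t m0, int s0, int32_t m1, int s1, int32_t zp)
{
    uint8_t qa = requant_u8(a, m0, s0, zp);
    uint8_t qb = requant_u8(b, m1, s1, zp);
    return qa < qb ? qa : qb;
}
```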
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_1.vx
new file mode 100644
index 0000000..62ac848
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_1.vx
@@ -0,0 +1,210 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
+_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
+_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
+_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
+
+#define MINIMUM_F16TOQUANT_IMPL(name, src0_type, copy_type, dst_type) \
+__kernel void minimum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ copy_type data0; \
+ src0_type src0; \
+ vxc_half8 src1; \
+ vxc_short8 data1; \
+ dst_type dst0, dst1, dst; \
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = min(dst0, dst1); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_F16TOQUANT_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)
+MINIMUM_F16TOQUANT_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)
+MINIMUM_F16TOQUANT_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)
+MINIMUM_F16TOQUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
+MINIMUM_F16TOQUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)
+MINIMUM_F16TOQUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
+
+#define MINIMUM_F16TOQUANT_2D_IMPL(name, src0_type, copy_type, dst_type) \
+__kernel void minimum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ copy_type data0; \
+ src0_type src0; \
+ vxc_half8 src1; \
+ vxc_short8 data1; \
+ dst_type dst0, dst1, dst; \
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src0, data0, 16); \
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ dst = min(dst0, dst1); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_F16TOQUANT_2D_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)
+MINIMUM_F16TOQUANT_2D_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)
+MINIMUM_F16TOQUANT_2D_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)
+MINIMUM_F16TOQUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)
+MINIMUM_F16TOQUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)
+MINIMUM_F16TOQUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
+
+#define MINIMUM_QUANT_F16TOF16_IMPL(name, src_type) \
+__kernel void minimum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ src_type vec0; \
+ vxc_half8 src0, src1; \
+ vxc_short8 data1, dst; \
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 ms0; \
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ \
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_QUANT_F16TOF16_IMPL(U8F16toF16, vxc_uchar16)
+MINIMUM_QUANT_F16TOF16_IMPL(I8F16toF16, vxc_char16)
+MINIMUM_QUANT_F16TOF16_IMPL(I16F16toF16, vxc_short8)
+
+#define MINIMUM_QUANT_F16TOF16_2D_IMPL(name, src_type) \
+__kernel void minimum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ src_type vec0; \
+ vxc_half8 src0, src1; \
+ vxc_short8 data1, dst; \
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src1, data1, 16); \
+ \
+ vxc_ushort8 ms0; \
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ \
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_QUANT_F16TOF16_2D_IMPL(U8F16toF16, vxc_uchar16)
+MINIMUM_QUANT_F16TOF16_2D_IMPL(I8F16toF16, vxc_char16)
+MINIMUM_QUANT_F16TOF16_2D_IMPL(I16F16toF16, vxc_short8)
+
+#define MINIMUM_QUANTTOF16_IMPL(name, src_type) \
+__kernel void minimum_##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ \
+ src_type vec0, vec1; \
+ vxc_half8 src0, src1; \
+ vxc_short8 dst; \
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ \
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_QUANTTOF16_IMPL(U8U8toF16, vxc_uchar16)
+MINIMUM_QUANTTOF16_IMPL(I8I8toF16, vxc_char16)
+MINIMUM_QUANTTOF16_IMPL(I16I16toF16, vxc_short8)
+
+#define MINIMUM_QUANTTOF16_2D_IMPL(name, src_type) \
+__kernel void minimum_##name##_2D \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+ src_type vec0, vec1; \
+ vxc_half8 src0, src1; \
+ vxc_short8 dst; \
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ vxc_ushort8 mp0, mp1; \
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift0_Lo_2x8); \
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift1_Lo_2x8); \
+ \
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \
+ _viv_asm(COPY, dst, src0, 16); \
+ \
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+MINIMUM_QUANTTOF16_2D_IMPL(U8U8toF16, vxc_uchar16)
+MINIMUM_QUANTTOF16_2D_IMPL(I8I8toF16, vxc_char16)
+MINIMUM_QUANTTOF16_2D_IMPL(I16I16toF16, vxc_short8)
\ No newline at end of file
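The F16 paths in minimum_1.vx above compare in half precision with VXC_VertMin3_Half and copy the raw 16-bit lanes in and out via _viv_asm(COPY, ...), while the quantized operand is first promoted through the same uniU8MulAndPostShift0_Lo_2x8 uniform. For reference, here is a hedged host-side sketch of how the multAndoutZP parameter could be packed; the field layout (multiplier in the low 16 bits of .x, zero point in the second word) is inferred only from the comments on the uniforms and is an assumption, not something this patch defines.

```c
#include <stdint.h>

/* Hedged sketch: packing the multiplier and output zero point into the int2
 * that backs multAndoutZP0 / multAndoutZP1.  Layout is inferred from the
 * "[0:15] multiplier" comment above, not confirmed by this patch. */
typedef struct { int32_t x, y; } host_int2;   /* stand-in for the device int2 */

static host_int2 pack_mult_and_zp(uint16_t multiplier, int32_t out_zp)
{
    host_int2 v;
    v.x = (int32_t)multiplier;  /* fixed-point multiplier, low 16 bits of .x */
    v.y = out_zp;               /* output zero point in the second word */
    return v;
}
```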
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_fp16.vx
deleted file mode 100644
index f60a751..0000000
--- a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_fp16.vx
+++ /dev/null
@@ -1,317 +0,0 @@
-#include "cl_viv_vx_ext.h"
-
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;
-_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;
-
-__kernel void minimum_I8F16toI8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_char16 src0, src2, dst;
- vxc_short8 src1, src3, src4, src5;
- vxc_half8 data0, data1, data2, data3;
- vxc_char16 tmp0, tmp1;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
- _viv_asm(COPY, data1, src4, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- dst = min(src0, tmp0);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I8F16toI8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_char16 src0, src2, dst;
- vxc_short8 src1, src3, src4, src5;
- vxc_half8 data0, data1, data2, data3;
- vxc_char16 tmp0;
-
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
- _viv_asm(COPY, data1, src4, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);
- dst = min(src0, tmp0);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I8F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_char8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
-
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I8F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_char8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
-_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
-
-__kernel void minimum_U8F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
-
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- vxc_ushort8 ms0;
- _viv_asm(COPY, ms0, multAndoutZP0, 16);
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
- uniU8MulAndPostShift_0_Lo_2x8);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_U8F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_uchar8 vec0, vec2;
- vxc_short8 vec1, vec3, dst;
- vxc_half8 src0, src1, src2, src3;
-
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- vxc_ushort8 ms0;
- _viv_asm(COPY, ms0, multAndoutZP0, 16);
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
- uniU8MulAndPostShift_0_Lo_2x8);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;
-_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
-__kernel void minimum_U8F16toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_uchar16 src0, dst0, dst1;
- vxc_ushort8 src1, src2;
- vxc_half8 data1, data2;
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data1, src1, 16);
- _viv_asm(COPY, data2, src2, 16);
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = min(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_U8F16toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_uchar16 src0, dst0, dst1;
- vxc_ushort8 src1, src2;
- vxc_half8 data1, data2;
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data1, src1, 16);
- _viv_asm(COPY, data2, src2, 16);
-
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Hi_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = min(dst0, dst1);
-
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_F16F16toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_ushort8 src0, src1;
- vxc_half8 data0, data1;
- vxc_uchar16 dst0, dst1;
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data0, src0, 16);
- _viv_asm(COPY, data1, src1, 16);
-
- vxc_ushort8 mp1;
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = min(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_F16F16toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int2 coord = (int2)(get_global_id(0), get_global_id(1));
-
- vxc_ushort8 src0, src1;
- vxc_half8 data0, data1;
- vxc_uchar16 dst0, dst1;
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data0, src0, 16);
- _viv_asm(COPY, data1, src1, 16);
-
- vxc_ushort8 mp1;
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniConvertFp16toU8_2x8);
- dst0 = min(dst0, dst1);
-
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx
deleted file mode 100644
index c2f5ca5..0000000
--- a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx
+++ /dev/null
@@ -1,237 +0,0 @@
-#include "cl_viv_vx_ext.h"
-
-_viv_uniform VXC_512Bits uniConvertI16toI16_2x8;
-_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;
-_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;
-
-_viv_uniform float outputScale;
-_viv_uniform float output_zp;
-_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
-_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;
-_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4;
-
-__kernel void minimum_I16F16toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 src0, src1, tmp0, dst;
- vxc_half8 data0;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);
- dst = min(src0, tmp0);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I16F16toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 src0, src1, tmp0, dst;
- vxc_half8 data0;
-
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src1, 16);
-
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);
- dst = min(src0, tmp0);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I16F16toF16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
-
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I16F16toF16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 vec0, vec1, dst;
- vxc_half8 src0, src1;
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, src1, vec1, 16);
-
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);
-
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
- _viv_asm(COPY, dst, src0, 16);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_F16F16toI16
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 src0, src1, dst;
- vxc_half8 data0, data1;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- _viv_asm(COPY, data0, src0, 16);
- _viv_asm(COPY, data1, src1, 16);
-
- VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
-
- int4 tmpDst0, tmpDst1;
- float4 tmpData0, tmpData1;
- VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);
- VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_F16F16toI16_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 src0, src1, dst;
- vxc_half8 data0, data1;
-
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- _viv_asm(COPY, data0, src0, 16);
- _viv_asm(COPY, data1, src1, 16);
-
- VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));
-
- int4 tmpDst0, tmpDst1;
- float4 tmpData0, tmpData1;
- VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);
- VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
-_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
-__kernel void minimum_I16I16toU8
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-
- vxc_short8 src0, src1;
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- vxc_uchar16 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = min(dst0, dst1);
-
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
-
-__kernel void minimum_I16I16toU8_2D
- (
- __read_only image2d_array_t input0,
- __read_only image2d_array_t input1,
- __write_only image2d_array_t output
- )
-{
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
-
- vxc_short8 src0, src1;
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
-
- vxc_uchar16 dst0, dst1, dst;
- vxc_ushort8 mp0, mp1;
- _viv_asm(COPY, mp0, multAndoutZP0, 16);
- _viv_asm(COPY, mp1, multAndoutZP1, 16);
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift0_Lo_2x8);
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
- uniU8MulAndPostShift1_Lo_2x8);
- dst = min(dst0, dst1);
-
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
-}
\ No newline at end of file
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx
index eb248fb..4d120f8 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx
@@ -41,14 +41,17 @@ __kernel void one_hot_##name0##to##name1 \
coord.z ++; \
} while (coord.z < depth); \
}
-ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)
-ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)
-ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)
-ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)
-ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)
-ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)
-ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)
-ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)
+ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)
+ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)
+ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)
+ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)
+ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)
+ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)
+ONE_HOT_SH_IMPL(I16, BI16, vxc_short8, vxc_short8, vxc_ushort8)
+ONE_HOT_SH_IMPL(I16, I8, vxc_short8, vxc_short8, vxc_uchar8)
+ONE_HOT_SH_IMPL(I16, U8, vxc_short8, vxc_short8, vxc_uchar8)
+ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)
+ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)
#define ONE_HOT_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \
__kernel void one_hot_##name0##to##name1##_2D \
@@ -97,14 +100,17 @@ __kernel void one_hot_##name0##to##name1##_2D \
coord.y += 4; \
} while (coord.y < depth); \
}
-ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)
-ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)
-ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)
-ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)
-ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)
-ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)
-ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)
-ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)
+ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)
+ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)
+ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)
+ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)
+ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)
+ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)
+ONE_HOT_SH_IMPL_2D(I16, BI16, vxc_short8, vxc_short8, vxc_ushort8)
+ONE_HOT_SH_IMPL_2D(I16, I8, vxc_short8, vxc_short8, vxc_uchar8)
+ONE_HOT_SH_IMPL_2D(I16, U8, vxc_short8, vxc_short8, vxc_uchar8)
+ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)
+ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)
_viv_uniform float input_scale;
_viv_uniform float input_tail;
@@ -148,8 +154,11 @@ __kernel void one_hot_##name0##to##name1 \
coord.z ++; \
} while (coord.z < depth); \
}
-ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
-ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
+ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
+ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
+ONE_HOT_ASYM_SH_IMPL(U8, I8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
+ONE_HOT_ASYM_SH_IMPL(U8, I16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
+ONE_HOT_ASYM_SH_IMPL(U8, BI16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
#define ONE_HOT_ASYM_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \
__kernel void one_hot_##name0##to##name1##_2D \
@@ -200,8 +209,11 @@ __kernel void one_hot_##name0##to##name1##_2D \
coord.y += 4; \
} while (coord.y < depth); \
}
-ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
-ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
+ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
+ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
+ONE_HOT_ASYM_SH_IMPL_2D(U8, I8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
+ONE_HOT_ASYM_SH_IMPL_2D(U8, I16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
+ONE_HOT_ASYM_SH_IMPL_2D(U8, BI16, vxc_uchar8, vxc_uchar8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx
new file mode 100644
index 0000000..602f6f5
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx
@@ -0,0 +1,330 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniVecShift10;
+_viv_uniform VXC_512Bits uniAddRShift;
+_viv_uniform VXC_512Bits uniGetTempVal;
+_viv_uniform VXC_512Bits uniExtractBytes;
+
+_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+
+_viv_uniform float output_scale;
+_viv_uniform float output_zp;
+
+#define RESIZE_BILINEAR_4X1(input, mean, output) \
+ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \
+ _viv_asm(CONV, dst0, tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ _viv_asm(COPY, dst, dst1, 8); \
+ VXC_WriteImage(output, coord_out, dst, \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+
+#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output0, \
+ __write_only image2d_array_t output1, \
+ __write_only image2d_array_t output2, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float f32Var \
+ ) \
+{ \
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \
+ \
+ int4 xPos = get_global_id(0); \
+ int yPos = get_global_id(1); \
+ \
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
+ xPos += (int4)(0, 1, 2, 3); \
+ \
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
+ int4 sx = fx0 & 0xffff8000; \
+ fx0 -= sx; \
+ sx = sx >> 15; \
+ \
+ vxc_short4 fx; \
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAddRShift); \
+ \
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \
+ int sy = fy & 0xffff8000; \
+ \
+ fy -= sy; \
+ sy = sy >> 15; \
+ \
+ fy = (fy + (1<< 4)) >> 5; \
+ \
+ vxc_uchar16 line0Y; \
+ vxc_uchar16 line1Y; \
+ int4 coord; \
+ sx = sx + *xOffset; \
+ coord.xyz = sx.xyz; \
+ coord.w = sy + *yOffset; \
+ int2 coord1 = (int2)(sx.w, coord.w); \
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 test01, temp1; \
+ int4 test02, temp2; \
+ int4 tt; \
+ vxc_uchar4 val; \
+ int2 coord_out = (int2)(xPos.x, yPos); \
+ \
+ vxc_uchar8 line1, line2; \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ \
+ vxc_float4 tmp_dst; \
+ vxc_uchar4 u8_dst; \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ conv_type dst0; \
+ dst_type dst1; \
+ copy_type dst; \
+ tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \
+ _viv_asm(CONV, dst0, tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ _viv_asm(COPY, dst, dst1, 8); \
+ VXC_WriteImage(output0, coord_out, dst, \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ \
+ RESIZE_BILINEAR_4X1(input1, gMean, output1) \
+ RESIZE_BILINEAR_4X1(input2, bMean, output2) \
+}
+PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)
+PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)
+
+#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output0, \
+ __write_only image2d_array_t output1, \
+ __write_only image2d_array_t output2, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float f32Var \
+ ) \
+{ \
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \
+ int4 xPos = get_global_id(0); \
+ int yPos = get_global_id(1); \
+ \
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
+ xPos += (int4)(0, 1, 2, 3); \
+ \
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
+ int4 sx = fx0 & 0xffff8000; \
+ fx0 -= sx; \
+ sx = sx >> 15; \
+ \
+ vxc_short4 fx; \
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \
+ \
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \
+ int sy = fy & 0xffff8000; \
+ \
+ fy -= sy; \
+ sy = sy >> 15; \
+ fy = (fy + (1<< 4)) >> 5; \
+ \
+ vxc_uchar16 line0Y; \
+ vxc_uchar16 line1Y; \
+ int4 coord; \
+ sx = sx + *xOffset; \
+ coord.xyz = sx.xyz; \
+ coord.w = sy + *yOffset; \
+ int2 coord1 = (int2)(sx.w, coord.w); \
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 test01, temp1; \
+ int4 test02, temp2; \
+ int2 coord_out = (int2)(xPos.x, yPos); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ \
+ vxc_float4 tmp_dst; \
+ vxc_uchar4 u8_dst; \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ int4 dst0; \
+ write_type dst; \
+ tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)
+PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)
\ No newline at end of file
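Note on the 8-bit resize kernels above (PRE_PROCESS_RGB888_PLANAR_8BITS): the y fraction arrives in Q15 (sy keeps the integer coordinate after masking with 0xffff8000), is rounded down to Q10 via (fy + (1 << 4)) >> 5, and the x fraction goes through uniAddRShift with the same 1 << 4 bias, presumably the same rounded shift. The bilinear blend then stays in fixed point until the per-channel (pixel - mean) * output_scale + output_zp affine. The dp uniforms (uniVecShift10, uniGetTempVal, uniExtractBytes) are opaque host-configured tables, so the scalar sketch below reflects how the surrounding arithmetic reads rather than a confirmed transcription:

/* Hedged scalar model of the fixed-point bilinear blend, assuming the dp
 * tables perform the usual left-pixel shift and fraction-weighted difference. */
static unsigned char bilinear_u8(unsigned char tl, unsigned char tr,
                                 unsigned char bl, unsigned char br,
                                 int fx /* Q10 */, int fy /* Q10 */)
{
    int top    = (tl << 10) + fx * (tr - tl);        /* horizontal blend, Q10 */
    int bottom = (bl << 10) + fx * (br - bl);        /* horizontal blend, Q10 */
    int acc    = (top << 10) + fy * (bottom - top);  /* vertical blend,   Q20 */
    return (unsigned char)((acc + (1 << 19)) >> 20); /* matches the 1 << 19 rounding bias */
}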
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx
new file mode 100644
index 0000000..5a9942c
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx
@@ -0,0 +1,152 @@
+/*
+ ============================================================================
+ Name : pre_process_rgb888_planar_1.vx
+ Author : Sam
+ Version :
+ Copyright : Your copyright notice
+ Description :
+ ============================================================================
+ */
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;
+_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;
+
+_viv_uniform float output_scale;
+_viv_uniform float output_zp;
+
+#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output0, \
+ __write_only image2d_array_t output1, \
+ __write_only image2d_array_t output2, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float f32Var \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+ coord.xy += (int2)(*xOffset, *yOffset); \
+ vxc_uchar16 src0, src1, src2; \
+ dst_type dst0, dst1; \
+ \
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ coord.x = coord.z + 8; \
+ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \
+ rMean * output_scale - output_zp, output_scale); \
+ \
+ half4 paramData_f16; \
+ copy_type tmp_dst; \
+ _viv_asm(CONV, paramData_f16, paramData0); \
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevHi_2x8); \
+ _viv_asm(COPY, tmp_dst, dst0, 16); \
+ VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, tmp_dst, dst1, 16); \
+ VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \
+ gMean * output_scale - output_zp, output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData1); \
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevHi_2x8); \
+ _viv_asm(COPY, tmp_dst, dst0, 16); \
+ VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, tmp_dst, dst1, 16); \
+ VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \
+ bMean * output_scale - output_zp, output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData2); \
+ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevHi_2x8); \
+ _viv_asm(COPY, tmp_dst, dst0, 16); \
+ VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, tmp_dst, dst1, 16); \
+ VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)
+
+#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output0, \
+ __write_only image2d_array_t output1, \
+ __write_only image2d_array_t output2, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float f32Var \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+ coord.xy += (int2) (*xOffset, *yOffset); \
+ vxc_uchar16 src0, src1, src2; \
+ write_type dst; \
+ \
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \
+ rMean * output_scale - output_zp, output_scale); \
+ \
+ half4 paramData_f16; \
+ _viv_asm(CONV, paramData_f16, paramData0); \
+ \
+ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevHi_2x8); \
+ VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \
+ gMean * output_scale - output_zp, output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData1); \
+ \
+ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevHi_2x8); \
+ VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \
+ bMean * output_scale - output_zp, output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData2); \
+ \
+ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevHi_2x8); \
+ VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)
+PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)
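Both copy variants above fold the per-channel normalization into a single affine step: paramData packs (mean * output_scale - output_zp) and output_scale, converted to half4 and handed to the uniDataMeanStddev tables, so each output pixel ends up as px * output_scale - (mean * output_scale - output_zp). A minimal scalar sketch of that affine, assuming the dp tables apply it element-wise over the 8- or 16-pixel vectors:

/* Minimal scalar sketch of the per-channel affine applied by the copy kernels. */
static float normalize_px(unsigned char px, float mean, float scale, float zp)
{
    /* px*scale - (mean*scale - zp) == (px - mean)*scale + zp */
    return ((float)px - mean) * scale + zp;
}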
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx
new file mode 100644
index 0000000..a82a3ba
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx
@@ -0,0 +1,122 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;
+_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;
+_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;
+_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;
+_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;
+
+__kernel void pre_process_rgb888_planar_4over3_U8toU8
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __read_only image2d_array_t input2,
+ __write_only image2d_array_t output0,
+ __write_only image2d_array_t output1,
+ __write_only image2d_array_t output2,
+ global int *xRatio,
+ global int *yRatio,
+ global int *xOffset,
+ global int *yOffset,
+ float rMean,
+ float gMean,
+ float bMean,
+ float f32Var
+ )
+{
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
+ int4 coord_out;
+
+ vxc_uchar16 src0, src1, src2, src3;
+ vxc_uchar16 dst0, dst1, dst2;
+
+ VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ coord_out.xy = (coord_in.xy >> 2) * 3;
+ coord_out.zw = coord_in.yy + (int2)(1, 2);
+
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
+
+ VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+
+ VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
+
+ VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+
+ VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
+
+ VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+}
+
+__kernel void pre_process_rgb888_planar_half_U8toU8
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __read_only image2d_array_t input2,
+ __write_only image2d_array_t output0,
+ __write_only image2d_array_t output1,
+ __write_only image2d_array_t output2,
+ global int *xRatio,
+ global int *yRatio,
+ global int *xOffset,
+ global int *yOffset,
+ float rMean,
+ float gMean,
+ float bMean,
+ float f32Var
+ )
+{
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
+
+ vxc_uchar16 src0, src1, src2;
+
+ VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ coord_in.zw = coord_in.xy >> 1;
+
+ VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+}
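Two fixed-ratio resize variants above: the 4over3 kernel maps every aligned 4x4 input tile to a 3x3 output tile (the blend weights live in the uniBilinear_4over3_* uniforms and are not visible here), while the half kernel decimates by 2, keeping even columns (src.s02468ace) and even rows with no filtering. The coordinate mapping, as a sketch:

/* Coordinate mapping sketches for the two fixed-ratio kernels above. */
static void map_4over3(int in_x, int in_y, int *out_x, int *out_y)
{
    /* each aligned 4x4 input tile produces a 3x3 output tile */
    *out_x = (in_x >> 2) * 3;
    *out_y = (in_y >> 2) * 3;
}

static void map_half(int in_x, int in_y, int *out_x, int *out_y)
{
    /* 2:1 decimation: keep even columns and even rows, no filtering */
    *out_x = in_x >> 1;
    *out_y = in_y >> 1;
}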
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx
index bd97b11..8f47577 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx
@@ -1,12 +1,4 @@
-/*
- ============================================================================
- Name : libNNExt.vx
- Author : VSI
- Version :
- Copyright : Your copyright notice
- Description :
- ============================================================================
- */
+
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
typedef struct Image
diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
index 93da98e..0dd28ed 100644
--- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
+++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
@@ -4937,49 +4937,14 @@ __kernel void detect_post_box_U8_U8toF32(\n\
}\n\
"; /* end of detect_post_box_vx*/
-static const char eltwise_unary_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char eltwise_unary_2d_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform float alpha;\n\
_viv_uniform float beta;\n\
\n\
-float4 eltwise_unary_sin(float4 x)\n\
-{\n\
- return native_sin(x);\n\
-}\n\
-\n\
-float4 eltwise_unary_cos(float4 x)\n\
-{\n\
- return native_cos(x);\n\
-}\n\
-\n\
#define logE (1.44269502f)\n\
#define twoLogE (logE * 2.0f)\n\
-float4 eltwise_unary_exp(float4 x)\n\
-{\n\
- x *= logE;\n\
- x = exp2(x);\n\
- return x;\n\
-}\n\
-\n\
#define rlogE (0.693147182f)\n\
-float4 eltwise_unary_log(float4 x)\n\
-{\n\
- x = log2(x);\n\
- return x * rlogE;\n\
-}\n\
-\n\
-float4 eltwise_unary_elu(float4 val)\n\
-{\n\
- float4 x = val * logE;\n\
- x = exp2(x) * alpha - alpha;\n\
-\n\
- return val < 0 ? x : val;\n\
-}\n\
-\n\
-float4 eltwise_unary_neg(float4 x)\n\
-{\n\
- return x * -1;\n\
-}\n\
\n\
float4 eltwise_unary_hard_sigmoid(float4 x)\n\
{\n\
@@ -5017,43 +4982,51 @@ float4 eltwise_unary_round(float4 x)\n\
return convert_float4(convert_int4_rte(x));\n\
}\n\
\n\
-#define MUL2_RSQRTPI (1.1283791670955126f)\n\
-float erf_eval(float x)\n\
+float4 evaluate_polynomial_alpha(float4 x2)\n\
{\n\
- float res = 0;\n\
- float tmp = x;\n\
- float factorial = 1;\n\
- float x_pow = x;\n\
- float one = 1.0f;\n\
- float n = 1;\n\
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,\n\
+ -2.10102402082508e-06f, -5.69250639462346e-05f};\n\
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,\n\
+ -1.60960333262415e-02f, 0};\n\
\n\
- if (x <= -3)\n\
- return -1;\n\
- else if(x >= 3)\n\
- return 1;\n\
+ float4 poly = alpha0.x * x2 + alpha0.y;\n\
+ poly = poly * x2 + alpha0.z;\n\
+ poly = poly * x2 + alpha0.w;\n\
+ poly = poly * x2 + alpha1.x;\n\
+ poly = poly * x2 + alpha1.y;\n\
+ poly = poly * x2 + alpha1.z;\n\
\n\
- while (fabs(tmp) > 1e-5)\n\
- {\n\
- res += tmp;\n\
-\n\
- factorial *= n;\n\
- one *= -1;\n\
- x_pow *= x * x;\n\
- tmp = one / factorial * x_pow / ( 2 * n + 1);\n\
-\n\
- n += 1.0f;\n\
- }\n\
- return res * MUL2_RSQRTPI;\n\
+ return poly;\n\
}\n\
+\n\
+float4 evaluate_polynomial_beta(float4 x2)\n\
+{\n\
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,\n\
+ -1.68282697438203e-03f, -7.37332916720468e-03f};\n\
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};\n\
+\n\
+ float4 poly = beta0.x * x2 + beta0.y;\n\
+ poly = poly * x2 + beta0.z;\n\
+ poly = poly * x2 + beta0.w;\n\
+ poly = poly * x2 + beta1.x;\n\
+\n\
+ return 1.0f / poly;\n\
+}\n\
+\n\
+float4 erf_eval(float4 _x)\n\
+{\n\
+ float4 x = clamp(_x, -4, 4);\n\
+ float4 x2 = x * x;\n\
+\n\
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);\n\
+}\n\
+\n\
#define RSQRT2 (0.70710678118654752440084436210485f)\n\
float4 eltwise_unary_gelu(float4 x)\n\
{\n\
float4 erf, data;\n\
data = x * RSQRT2;\n\
- erf.x = erf_eval(data.x);\n\
- erf.y = erf_eval(data.y);\n\
- erf.z = erf_eval(data.z);\n\
- erf.w = erf_eval(data.w);\n\
+ erf = erf_eval(data);\n\
x = 0.5f * x * (1 + erf);\n\
\n\
return x;\n\
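The hunk above replaces the scalar, loop-based Taylor-series erf (which bailed out to +/-1 beyond |x| >= 3) with a branch-free rational approximation erf(x) ~= x * P(x^2) / Q(x^2), with the argument clamped to [-4, 4] and both polynomials evaluated by Horner's rule on float4 lanes; gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) therefore no longer needs the per-component loop. A scalar C transcription of the same approximation, with the coefficients copied from the source:

#include <math.h>

/* Scalar transcription of the new erf_eval: erf(x) ~= x * P(x^2) / Q(x^2),
 * x clamped to [-4, 4], P and Q evaluated by Horner's rule. */
static float erf_approx(float x)
{
    x = fminf(fmaxf(x, -4.0f), 4.0f);
    float x2 = x * x;
    float p = -2.72614225801306e-10f;
    p = p * x2 + 2.77068142495902e-08f;
    p = p * x2 - 2.10102402082508e-06f;
    p = p * x2 - 5.69250639462346e-05f;
    p = p * x2 - 7.34990630326855e-04f;
    p = p * x2 - 2.95459980854025e-03f;
    p = p * x2 - 1.60960333262415e-02f;
    float q = -1.45660718464996e-05f;
    q = q * x2 - 2.13374055278905e-04f;
    q = q * x2 - 1.68282697438203e-03f;
    q = q * x2 - 7.37332916720468e-03f;
    q = q * x2 - 1.42647390514189e-02f;
    return x * p / q;
}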
@@ -5111,6 +5084,203 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\
_viv_asm(COPY, dst, dst2, 16); \\\n\
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
+//MISH\n\
+ELTSISE_UNARY_2D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(mish, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//HARD_SIGMOID\n\
+ELTSISE_UNARY_2D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//ROUND\n\
+ELTSISE_UNARY_2D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//GELU\n\
+ELTSISE_UNARY_2D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//HARD_GELU\n\
+ELTSISE_UNARY_2D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+\n\
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\
+_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\
+\n\
+#define ELTSISE_UNARY_BF16_2D(func_name) \\\n\
+ __kernel void func_name##_BF16toBF16_2D( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int type, \\\n\
+ float _alpha, \\\n\
+ float _beta \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ vxc_ushort8 src0, src1, dst; \\\n\
+ VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 vecA; \\\n\
+ float4 vecB; \\\n\
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\
+ _viv_asm(COPY, vecA, src1, 16); \\\n\
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\
+ _viv_asm(COPY, vecB, src1, 16); \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
+ vecB = eltwise_unary_##func_name(vecB); \\\n\
+ \\\n\
+ _viv_asm(COPY, src0, vecA, 16); \\\n\
+ _viv_asm(COPY, src1, vecB, 16); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+//MISH\n\
+ELTSISE_UNARY_BF16_2D(mish)\n\
+//HARD_SIGMOID\n\
+ELTSISE_UNARY_BF16_2D(hard_sigmoid)\n\
+//ROUND\n\
+ELTSISE_UNARY_BF16_2D(round)\n\
+//GELU\n\
+ELTSISE_UNARY_BF16_2D(gelu)\n\
+//HARD_GELU\n\
+ELTSISE_UNARY_BF16_2D(hard_gelu)\n\
+"; /* end of eltwise_unary_2d_0_vx*/
+
+static const char eltwise_unary_2d_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float alpha;\n\
+_viv_uniform float beta;\n\
+\n\
+float4 eltwise_unary_sin(float4 x)\n\
+{\n\
+ return native_sin(x);\n\
+}\n\
+\n\
+float4 eltwise_unary_cos(float4 x)\n\
+{\n\
+ return native_cos(x);\n\
+}\n\
+\n\
+#define logE (1.44269502f)\n\
+#define twoLogE (logE * 2.0f)\n\
+float4 eltwise_unary_exp(float4 x)\n\
+{\n\
+ x *= logE;\n\
+ x = exp2(x);\n\
+ return x;\n\
+}\n\
+\n\
+#define rlogE (0.693147182f)\n\
+float4 eltwise_unary_log(float4 x)\n\
+{\n\
+ x = log2(x);\n\
+ return x * rlogE;\n\
+}\n\
+\n\
+float4 eltwise_unary_neg(float4 x)\n\
+{\n\
+ return x * -1;\n\
+}\n\
+\n\
+float4 eltwise_unary_selu(float4 val)\n\
+{\n\
+ float4 x = val * logE;\n\
+ x = exp2(x) * alpha - alpha;\n\
+\n\
+ return val < 0 ? x : val * beta;\n\
+}\n\
+\n\
+float4 eltwise_unary_celu(float4 val)\n\
+{\n\
+ float4 x = val * logE * beta;\n\
+ x = exp2(x) * alpha - alpha;\n\
+\n\
+ return val < 0 ? x : val;\n\
+}\n\
+\n\
+_viv_uniform float inputScale;\n\
+_viv_uniform float inputTail;\n\
+_viv_uniform float outputScale;\n\
+_viv_uniform float outputZP;\n\
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\
+_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\
+\n\
+#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \\\n\
+ src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\
+ __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int type, \\\n\
+ float _alpha, \\\n\
+ float _beta \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ src_type src0; \\\n\
+ src_copy_type src1; \\\n\
+ VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, src0, 16); \\\n\
+ \\\n\
+ float4 vecA; \\\n\
+ float4 vecB; \\\n\
+ VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\
+ VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\
+ vecA = vecA * inputScale + inputTail; \\\n\
+ vecB = vecB * inputScale + inputTail; \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
+ vecB = eltwise_unary_##func_name(vecB); \\\n\
+ vecA = vecA * outputScale + outputZP; \\\n\
+ vecB = vecB * outputScale + outputZP; \\\n\
+ \\\n\
+ convert_type dst0, dst1; \\\n\
+ _viv_asm(CONV_RTE, dst0, vecA); \\\n\
+ _viv_asm(CONV_RTE, dst1, vecB); \\\n\
+ dst_type dst2; \\\n\
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
+ dst_copy_type dst; \\\n\
+ _viv_asm(COPY, dst, dst2, 16); \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
//EXP\n\
ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
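The eltwise_unary_2d resource is split into two strings here, presumably to keep each generated program within size limits; the second string (eltwise_unary_2d_1_vx) carries the exp/sin/cos/log/neg group plus the new selu and celu activations, which read as selu(x) = beta*x for x >= 0 and alpha*(exp(x) - 1) otherwise, and celu(x) = x for x >= 0 and alpha*(exp(beta*x) - 1) otherwise. A scalar sketch (parameter roles inferred from the kernel bodies; the host presumably sets beta = 1/alpha for a standard CELU):

#include <math.h>

/* Scalar reading of eltwise_unary_selu / eltwise_unary_celu above. */
static float selu_scalar(float x, float alpha, float beta)
{
    return x < 0.0f ? alpha * (expf(x) - 1.0f) : beta * x;
}

static float celu_scalar(float x, float alpha, float beta)
{
    return x < 0.0f ? alpha * (expf(beta * x) - 1.0f) : x;
}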
@@ -5155,17 +5325,17 @@ ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//ELU\n\
-ELTSISE_UNARY_2D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//SELU\n\
+ELTSISE_UNARY_2D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
//NEG\n\
ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
@@ -5177,61 +5347,17 @@ ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//MISH\n\
-ELTSISE_UNARY_2D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(mish, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//HARD_SIGMOID\n\
-ELTSISE_UNARY_2D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//ROUND\n\
-ELTSISE_UNARY_2D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//GELU\n\
-ELTSISE_UNARY_2D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//HARD_GELU\n\
-ELTSISE_UNARY_2D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_2D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//CELU\n\
+ELTSISE_UNARY_2D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_2D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_2D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_2D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_2D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
\n\
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\
@@ -5274,65 +5400,22 @@ ELTSISE_UNARY_BF16_2D(sin)\n\
ELTSISE_UNARY_BF16_2D(cos)\n\
//LOG\n\
ELTSISE_UNARY_BF16_2D(log)\n\
-//ELU\n\
-ELTSISE_UNARY_BF16_2D(elu)\n\
+//SELU\n\
+ELTSISE_UNARY_BF16_2D(selu)\n\
//NEG\n\
ELTSISE_UNARY_BF16_2D(neg)\n\
-//MISH\n\
-ELTSISE_UNARY_BF16_2D(mish)\n\
-//HARD_SIGMOID\n\
-ELTSISE_UNARY_BF16_2D(hard_sigmoid)\n\
-//ROUND\n\
-ELTSISE_UNARY_BF16_2D(round)\n\
-//GELU\n\
-ELTSISE_UNARY_BF16_2D(gelu)\n\
-//HARD_GELU\n\
-ELTSISE_UNARY_BF16_2D(hard_gelu)\n\
-"; /* end of eltwise_unary_2d_vx*/
+//CELU\n\
+ELTSISE_UNARY_BF16_2D(celu)\n\
+"; /* end of eltwise_unary_2d_1_vx*/
-static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char eltwise_unary_3d_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform float alpha;\n\
_viv_uniform float beta;\n\
\n\
-float4 eltwise_unary_sin(float4 x)\n\
-{\n\
- return native_sin(x);\n\
-}\n\
-\n\
-float4 eltwise_unary_cos(float4 x)\n\
-{\n\
- return native_cos(x);\n\
-}\n\
-\n\
#define logE (1.44269502f)\n\
#define twoLogE (logE * 2.0f)\n\
-float4 eltwise_unary_exp(float4 x)\n\
-{\n\
- x *= logE;\n\
- x = exp2(x);\n\
- return x;\n\
-}\n\
-\n\
#define rlogE (0.693147182f)\n\
-float4 eltwise_unary_log(float4 x)\n\
-{\n\
- x = log2(x);\n\
- return x * rlogE;\n\
-}\n\
-\n\
-float4 eltwise_unary_elu(float4 val)\n\
-{\n\
- float4 x = val * logE;\n\
- x = exp2(x) * alpha - alpha;\n\
-\n\
- return val < 0 ? x : val;\n\
-}\n\
-\n\
-float4 eltwise_unary_neg(float4 x)\n\
-{\n\
- return x * -1;\n\
-}\n\
\n\
float4 eltwise_unary_hard_sigmoid(float4 x)\n\
{\n\
@@ -5370,43 +5453,51 @@ float4 eltwise_unary_round(float4 x)\n\
return convert_float4(convert_int4_rte(x));\n\
}\n\
\n\
-#define MUL2_RSQRTPI (1.1283791670955126f)\n\
-float erf_eval(float x)\n\
+float4 evaluate_polynomial_alpha(float4 x2)\n\
{\n\
- float res = 0;\n\
- float tmp = x;\n\
- float factorial = 1;\n\
- float x_pow = x;\n\
- float one = 1.0f;\n\
- float n = 1;\n\
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,\n\
+ -2.10102402082508e-06f, -5.69250639462346e-05f};\n\
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,\n\
+ -1.60960333262415e-02f, 0};\n\
\n\
- if (x <= -3)\n\
- return -1;\n\
- else if(x >= 3)\n\
- return 1;\n\
+ float4 poly = alpha0.x * x2 + alpha0.y;\n\
+ poly = poly * x2 + alpha0.z;\n\
+ poly = poly * x2 + alpha0.w;\n\
+ poly = poly * x2 + alpha1.x;\n\
+ poly = poly * x2 + alpha1.y;\n\
+ poly = poly * x2 + alpha1.z;\n\
\n\
- while (fabs(tmp) > 1e-5)\n\
- {\n\
- res += tmp;\n\
-\n\
- factorial *= n;\n\
- one *= -1;\n\
- x_pow *= x * x;\n\
- tmp = one / factorial * x_pow / ( 2 * n + 1);\n\
-\n\
- n += 1.0f;\n\
- }\n\
- return res * MUL2_RSQRTPI;\n\
+ return poly;\n\
}\n\
+\n\
+float4 evaluate_polynomial_beta(float4 x2)\n\
+{\n\
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,\n\
+ -1.68282697438203e-03f, -7.37332916720468e-03f};\n\
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};\n\
+\n\
+ float4 poly = beta0.x * x2 + beta0.y;\n\
+ poly = poly * x2 + beta0.z;\n\
+ poly = poly * x2 + beta0.w;\n\
+ poly = poly * x2 + beta1.x;\n\
+\n\
+ return 1.0f / poly;\n\
+}\n\
+\n\
+float4 erf_eval(float4 _x)\n\
+{\n\
+ float4 x = clamp(_x, -4, 4);\n\
+ float4 x2 = x * x;\n\
+\n\
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);\n\
+}\n\
+\n\
#define RSQRT2 (0.70710678118654752440084436210485f)\n\
float4 eltwise_unary_gelu(float4 x)\n\
{\n\
float4 erf, data;\n\
data = x * RSQRT2;\n\
- erf.x = erf_eval(data.x);\n\
- erf.y = erf_eval(data.y);\n\
- erf.z = erf_eval(data.z);\n\
- erf.w = erf_eval(data.w);\n\
+ erf = erf_eval(data);\n\
x = 0.5f * x * (1 + erf);\n\
\n\
return x;\n\
@@ -5464,6 +5555,201 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\
_viv_asm(COPY, dst, dst2, 16); \\\n\
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
+//MISH\n\
+ELTSISE_UNARY_3D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(mish, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//HARD_SIGMOID\n\
+ELTSISE_UNARY_3D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//ROUND\n\
+ELTSISE_UNARY_3D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//GELU\n\
+ELTSISE_UNARY_3D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//HARD_GELU\n\
+ELTSISE_UNARY_3D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+\n\
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\
+_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\
+#define ELTSISE_UNARY_BF16(func_name) \\\n\
+ __kernel void func_name##_BF16toBF16( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int type, \\\n\
+ float _alpha, \\\n\
+ float _beta \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
+ vxc_ushort8 src0, src1, dst; \\\n\
+ VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 vecA; \\\n\
+ float4 vecB; \\\n\
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\
+ _viv_asm(COPY, vecA, src1, 16); \\\n\
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\
+ _viv_asm(COPY, vecB, src1, 16); \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
+ vecB = eltwise_unary_##func_name(vecB); \\\n\
+ \\\n\
+ _viv_asm(COPY, src0, vecA, 16); \\\n\
+ _viv_asm(COPY, src1, vecB, 16); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+//MISH\n\
+ELTSISE_UNARY_BF16(mish)\n\
+//HARD_SIGMOID\n\
+ELTSISE_UNARY_BF16(hard_sigmoid)\n\
+//ROUND\n\
+ELTSISE_UNARY_BF16(round)\n\
+//GELU\n\
+ELTSISE_UNARY_BF16(gelu)\n\
+//HARD_GELU\n\
+ELTSISE_UNARY_BF16(hard_gelu)"; /* end of eltwise_unary_3d_0_vx*/
+
+static const char eltwise_unary_3d_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float alpha;\n\
+_viv_uniform float beta;\n\
+\n\
+float4 eltwise_unary_sin(float4 x)\n\
+{\n\
+ return native_sin(x);\n\
+}\n\
+\n\
+float4 eltwise_unary_cos(float4 x)\n\
+{\n\
+ return native_cos(x);\n\
+}\n\
+\n\
+#define logE (1.44269502f)\n\
+#define twoLogE (logE * 2.0f)\n\
+float4 eltwise_unary_exp(float4 x)\n\
+{\n\
+ x *= logE;\n\
+ x = exp2(x);\n\
+ return x;\n\
+}\n\
+\n\
+#define rlogE (0.693147182f)\n\
+float4 eltwise_unary_log(float4 x)\n\
+{\n\
+ x = log2(x);\n\
+ return x * rlogE;\n\
+}\n\
+\n\
+float4 eltwise_unary_neg(float4 x)\n\
+{\n\
+ return x * -1;\n\
+}\n\
+\n\
+float4 eltwise_unary_selu(float4 val)\n\
+{\n\
+ float4 x = val * logE;\n\
+ x = exp2(x) * alpha - alpha;\n\
+\n\
+ return val < 0 ? x : val * beta;\n\
+}\n\
+\n\
+float4 eltwise_unary_celu(float4 val)\n\
+{\n\
+ float4 x = val * logE * beta;\n\
+ x = exp2(x) * alpha - alpha;\n\
+\n\
+ return val < 0 ? x : val;\n\
+}\n\
+\n\
+_viv_uniform float inputScale;\n\
+_viv_uniform float inputTail;\n\
+_viv_uniform float outputScale;\n\
+_viv_uniform float outputZP;\n\
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\
+_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\
+\n\
+#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \\\n\
+ src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\
+__kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int type, \\\n\
+ float _alpha, \\\n\
+ float _beta \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
+ src_type src0; \\\n\
+ src_copy_type src1; \\\n\
+ VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, src0, 16); \\\n\
+ \\\n\
+ float4 vecA; \\\n\
+ float4 vecB; \\\n\
+ VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\
+ VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\
+ vecA = vecA * inputScale + inputTail; \\\n\
+ vecB = vecB * inputScale + inputTail; \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
+ vecB = eltwise_unary_##func_name(vecB); \\\n\
+ vecA = vecA * outputScale + outputZP; \\\n\
+ vecB = vecB * outputScale + outputZP; \\\n\
+ \\\n\
+ convert_type dst0, dst1; \\\n\
+ _viv_asm(CONV_RTE, dst0, vecA); \\\n\
+ _viv_asm(CONV_RTE, dst1, vecB); \\\n\
+ dst_type dst2; \\\n\
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
+ dst_copy_type dst; \\\n\
+ _viv_asm(COPY, dst, dst2, 16); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
//EXP\n\
ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
@@ -5508,17 +5794,17 @@ ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//ELU\n\
-ELTSISE_UNARY_3D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//SELU\n\
+ELTSISE_UNARY_3D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
//NEG\n\
ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
@@ -5530,61 +5816,17 @@ ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//MISH\n\
-ELTSISE_UNARY_3D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(mish, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//HARD_SIGMOID\n\
-ELTSISE_UNARY_3D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//ROUND\n\
-ELTSISE_UNARY_3D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//GELU\n\
-ELTSISE_UNARY_3D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
-//HARD_GELU\n\
-ELTSISE_UNARY_3D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
-ELTSISE_UNARY_3D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
-ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
-ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
+//CELU\n\
+ELTSISE_UNARY_3D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\
+ELTSISE_UNARY_3D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\
+ELTSISE_UNARY_3D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\
+ELTSISE_UNARY_3D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\
+ELTSISE_UNARY_3D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\
\n\
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\
@@ -5626,46 +5868,53 @@ ELTSISE_UNARY_BF16(sin)\n\
ELTSISE_UNARY_BF16(cos)\n\
//LOG\n\
ELTSISE_UNARY_BF16(log)\n\
-//ELU\n\
-ELTSISE_UNARY_BF16(elu)\n\
+//SELU\n\
+ELTSISE_UNARY_BF16(selu)\n\
//NEG\n\
ELTSISE_UNARY_BF16(neg)\n\
-//MISH\n\
-ELTSISE_UNARY_BF16(mish)\n\
-//HARD_SIGMOID\n\
-ELTSISE_UNARY_BF16(hard_sigmoid)\n\
-//ROUND\n\
-ELTSISE_UNARY_BF16(round)\n\
-//GELU\n\
-ELTSISE_UNARY_BF16(gelu)\n\
-//HARD_GELU\n\
-ELTSISE_UNARY_BF16(hard_gelu)"; /* end of eltwise_unary_3d_vx*/
+//CELU\n\
+ELTSISE_UNARY_BF16(celu)\n\
+"; /* end of eltwise_unary_3d_1_vx*/
static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
-#define MUL2_RSQRTPI (1.1283791670955126f)\n\
-float eltwise_unary_erf(float _x)\n\
+float4 evaluate_polynomial_alpha(float4 x2)\n\
{\n\
- float x = clamp(_x, -2, 2);\n\
- float res = 0;\n\
- float tmp = x;\n\
- float factorial = 1;\n\
- float x_pow = x;\n\
- float one = 1.0f;\n\
- float n = 1;\n\
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,\n\
+ -2.10102402082508e-06f, -5.69250639462346e-05f};\n\
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,\n\
+ -1.60960333262415e-02f, 0};\n\
\n\
- while (fabs(tmp) > 1e-5)\n\
- {\n\
- res += tmp;\n\
+ float4 poly = alpha0.x * x2 + alpha0.y;\n\
+ poly = poly * x2 + alpha0.z;\n\
+ poly = poly * x2 + alpha0.w;\n\
+ poly = poly * x2 + alpha1.x;\n\
+ poly = poly * x2 + alpha1.y;\n\
+ poly = poly * x2 + alpha1.z;\n\
\n\
- factorial *= n;\n\
- one *= -1;\n\
- x_pow *= x * x;\n\
- tmp = one / factorial * x_pow / ( 2 * n + 1);\n\
+ return poly;\n\
+}\n\
\n\
- n += 1.0f;\n\
- }\n\
- return res * MUL2_RSQRTPI;\n\
+float4 evaluate_polynomial_beta(float4 x2)\n\
+{\n\
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,\n\
+ -1.68282697438203e-03f, -7.37332916720468e-03f};\n\
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};\n\
+\n\
+ float4 poly = beta0.x * x2 + beta0.y;\n\
+ poly = poly * x2 + beta0.z;\n\
+ poly = poly * x2 + beta0.w;\n\
+ poly = poly * x2 + beta1.x;\n\
+\n\
+ return 1.0f / poly;\n\
+}\n\
+\n\
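+// Rational approximation: erf(x) ~= x * P(x^2) / Q(x^2), where\n\
+// evaluate_polynomial_alpha is P (degree 6 in x^2, Horner form) and\n\
+// evaluate_polynomial_beta returns 1/Q (degree 4 in x^2). x is clamped\n\
+// to [-4, 4] first, since erf is essentially +/-1 beyond that range.\n\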
+float4 eltwise_unary_erf(float4 _x)\n\
+{\n\
+ float4 x = clamp(_x, -4, 4);\n\
+ float4 x2 = x * x;\n\
+\n\
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);\n\
}\n\
\n\
_viv_uniform float inputScale;\n\
@@ -5691,10 +5940,7 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\
float4 vecA; \\\n\
VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\
vecA = vecA * inputScale + inputTail; \\\n\
- vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\
- vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\
- vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\
- vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
vecA = vecA * outputScale + outputZP; \\\n\
\\\n\
convert_type dst0; \\\n\
@@ -5735,10 +5981,7 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\
VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\
_viv_asm(COPY, vecA, src1, 16); \\\n\
- vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\
- vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\
- vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\
- vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
\\\n\
_viv_asm(COPY, src0, vecA, 16); \\\n\
\\\n\
@@ -5764,10 +6007,7 @@ __write_only image2d_array_t output \\\n\
float4 vecA; \\\n\
VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\
vecA = vecA * inputScale + inputTail; \\\n\
- vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\
- vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\
- vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\
- vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
vecA = vecA * outputScale + outputZP; \\\n\
\\\n\
convert_type dst0; \\\n\
@@ -5804,10 +6044,7 @@ ELTSISE_UNARY_3D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_s
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\
VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\
_viv_asm(COPY, vecA, src1, 16); \\\n\
- vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\
- vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\
- vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\
- vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\
+ vecA = eltwise_unary_##func_name(vecA); \\\n\
\\\n\
_viv_asm(COPY, src0, vecA, 16); \\\n\
\\\n\
@@ -5944,21 +6181,21 @@ __kernel void floordiv_##src0_name##src1_name##to##dst_name \\\n\
TENSOR_FLOORDIV(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\
vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\
TENSOR_FLOORDIV(F16, F16, I16, short4, vxc_short8, vxc_short8,\\\n\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\
TENSOR_FLOORDIV(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\
TENSOR_FLOORDIV(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\
vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\
\n\
TENSOR_FLOORDIV(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\
- vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\
+ vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\
TENSOR_FLOORDIV(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\
- vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\
+ vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\
\n\
TENSOR_FLOORDIV(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\
- vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\
+ vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\
TENSOR_FLOORDIV(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\
- vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\
+ vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\
\n\
TENSOR_FLOORDIV(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\
vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\
@@ -5966,7 +6203,6 @@ TENSOR_FLOORDIV(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\\\n\
vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\
\n\
\n\
-\n\
#define TENSOR_FLOORDIV_2D(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \\\n\
conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \\\n\
__kernel void floordiv_##src0_name##src1_name##to##dst_name##_2D \\\n\
@@ -5985,21 +6221,21 @@ __kernel void floordiv_##src0_name##src1_name##to##dst_name##_2D \\\n\
TENSOR_FLOORDIV_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\
vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\
TENSOR_FLOORDIV_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\\\n\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\
TENSOR_FLOORDIV_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\
- vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\
+ vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\
TENSOR_FLOORDIV_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\
vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\
\n\
TENSOR_FLOORDIV_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\
- vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\
+ vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\
TENSOR_FLOORDIV_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\
- vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\
+ vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\
\n\
TENSOR_FLOORDIV_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\
- vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\
+ vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\
TENSOR_FLOORDIV_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\
- vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\
+ vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\
\n\
TENSOR_FLOORDIV_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\
vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\
@@ -6664,6 +6900,161 @@ __kernel void gather_batch_F16toF16_axis0(\n\
}\n\
"; /* end of gather_batch_vx*/
+static const char gather_elements_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform int axis_size;\n\
+\n\
+#define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \\\n\
+__kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ Image img = create_image_from_image2d(input1, 4); \\\n\
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\
+ int4 indice = ((int4 *)indice_ptr)[0]; \\\n\
+ int4 indice1 = indice + axis_size; \\\n\
+ indice = indice < 0 ? indice1 : indice; \\\n\
+ \\\n\
+ data_type src; \\\n\
+ VXC_ReadImage(src, input0, (int2)(indice.x, coord.y), 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src, input0, (int2)(indice.y, coord.y), 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src, input0, (int2)(indice.z, coord.y), 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src, input0, (int2)(indice.w, coord.y), 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS0_2D(F16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS0_2D(I16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS0_2D(I8, vxc_char4)\n\
+GATHER_ELEMENTS_AXIS0_2D(U8, vxc_uchar4)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS0(name, data_type) \\\n\
+__kernel void gather_elements_axis0_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4); \\\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord); \\\n\
+ int4 indice = ((int4 *)indice_ptr)[0]; \\\n\
+ int4 indice1 = indice + axis_size; \\\n\
+ indice = indice < 0 ? indice1 : indice; \\\n\
+ \\\n\
+ data_type src; \\\n\
+ int4 coord_in = coord; \\\n\
+ coord_in.x = indice.x; \\\n\
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = indice.y; \\\n\
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \\\n\
+ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = indice.z; \\\n\
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \\\n\
+ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = indice.w; \\\n\
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \\\n\
+ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS0(F16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS0(I16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS0(I8, vxc_char4)\n\
+GATHER_ELEMENTS_AXIS0(U8, vxc_uchar4)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS1_2D(name, data_type) \\\n\
+__kernel void gather_elements_axis1_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type src; \\\n\
+ VXC_ReadImage(src, input0, (int2)(coord.x, index), 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS1_2D(F16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS1_2D(I16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS1_2D(I8, vxc_char4)\n\
+GATHER_ELEMENTS_AXIS1_2D(U8, vxc_uchar4)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS1(name, data_type) \\\n\
+__kernel void gather_elements_axis1_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type src; \\\n\
+ int4 coord_in = coord; \\\n\
+ coord_in.y = index; \\\n\
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS1(F16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS1(I16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS1(I8, vxc_char4)\n\
+GATHER_ELEMENTS_AXIS1(U8, vxc_uchar4)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS2(name, data_type) \\\n\
+__kernel void gather_elements_axis2_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type src; \\\n\
+ int4 coord_in = coord; \\\n\
+ coord_in.z = index; \\\n\
+ VXC_ReadImage2DArray(src, input0, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS2(F16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS2(I16, vxc_short4)\n\
+GATHER_ELEMENTS_AXIS2(I8, vxc_char4)\n\
+GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4)\n\
+"; /* end of gather_elements_vx*/
+
static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform int indices_num;\n\
@@ -13812,21 +14203,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\
\n\
L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \\\n\
ushort, half4, vxc_half8, vxc_ushort8)\n\
-L2NORMSCALE_AXIS0_2D(I16, F16, F16, short, vxc_short8, vxc_short8, r_inputScale, \\\n\
- ushort, half4, vxc_half8, vxc_ushort8)\n\
-L2NORMSCALE_AXIS0_2D(I16, F16, I16, short, vxc_short8, vxc_short8, r_inputScale, \\\n\
- short, int4, vxc_short8, vxc_short8)\n\
-L2NORMSCALE_AXIS0_2D(I8, F16, F16, char, vxc_char8, vxc_char8, r_inputScale, \\\n\
- ushort, half4, vxc_half8, vxc_ushort8)\n\
-L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, \\\n\
- char, int4, vxc_char8, vxc_char8)\n\
\n\
-\n\
-\n\
-#define L2NORMSCALE_AXIS0_U8_2D(in1_name, out_name,\\\n\
- dst_type, convert_type, output_type, copy_type) \\\n\
+#define L2NORMSCALE_AXIS0_QNT_2D(in0_name, in1_name, out_name,\\\n\
+ src_type, src_scalar_type, dst_type, convert_type, output_type, copy_type) \\\n\
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\
- void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \\\n\
+void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\
(\\\n\
__read_only image2d_t input,\\\n\
__read_only image2d_t scale,\\\n\
@@ -13839,8 +14220,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\
Image src_img = create_image_from_image2d(input, 1); \\\n\
uchar *src_ptr_base = (uchar *)src_img.ptr; \\\n\
uchar *src_ptr; \\\n\
- vxc_uchar8 src0, src1; \\\n\
- vxc_uchar8 val0, val1; \\\n\
+ src_type src0, src1; \\\n\
+ src_type val0, val1; \\\n\
int inputRemain; \\\n\
vxc_float4 sum = {0.0f}; \\\n\
vxc_uchar8 input_ZP ; \\\n\
@@ -13848,10 +14229,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\
src_ptr = src_ptr_base + (get_global_id(0) + get_global_id(1) * inputWidth); \\\n\
for (int i = 0; i < inputWidthCount; i++) \\\n\
{ \\\n\
- VXC_Vload8(src0, src_ptr, 0); \\\n\
- VXC_Vload8(src1, src_ptr, 1); \\\n\
- _viv_asm(COPY, val0, src0, 16); \\\n\
- _viv_asm(COPY, val1, src1, 16); \\\n\
+ VXC_Vload8(val0, src_ptr, 0); \\\n\
+ VXC_Vload8(val1, src_ptr, 1); \\\n\
VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\\\n\
uniSumSqrt_16x1); \\\n\
VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 1),\\\n\
@@ -13866,7 +14245,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\
inputRemain = inputWidth - offset; \\\n\
if (inputRemain > 0) \\\n\
{ \\\n\
- L2NORMSCALE_REM_PROCESS((uchar)inputZP) \\\n\
+ L2NORMSCALE_REM_PROCESS((src_scalar_type)inputZP) \\\n\
_viv_asm(COPY, val0, src0, 16); \\\n\
_viv_asm(COPY, val1, src1, 16); \\\n\
VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\\\n\
@@ -13888,8 +14267,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\
L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\
}\n\
\n\
-L2NORMSCALE_AXIS0_U8_2D(F16, F16, ushort, half4, vxc_half8, vxc_ushort8)\n\
-L2NORMSCALE_AXIS0_U8_2D(F16, U8, uchar, int4, vxc_uchar8, vxc_uchar8)\n\
+L2NORMSCALE_AXIS0_QNT_2D(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8)\n\
+L2NORMSCALE_AXIS0_QNT_2D(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8)\n\
+L2NORMSCALE_AXIS0_QNT_2D(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8)\n\
+L2NORMSCALE_AXIS0_QNT_2D(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8)\n\
+L2NORMSCALE_AXIS0_QNT_2D(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8)\n\
+L2NORMSCALE_AXIS0_QNT_2D(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8)\n\
"; /* end of l2normalizescale_axis0_vx*/
static const char l2normalizescale_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
@@ -13901,10 +14284,6 @@ _viv_uniform VXC_512Bits UniFp16MulHi_dp4x4;\n\
\n\
//int8 version\n\
_viv_uniform float r_inputScale;\n\
-_viv_uniform VXC_512Bits uniIntegerSquareLo_4x4;\n\
-_viv_uniform VXC_512Bits uniIntegerSquareHi_4x4;\n\
-_viv_uniform VXC_512Bits uniDataSquareAddU32Lo_4x4;\n\
-_viv_uniform VXC_512Bits uniDataSquareAddU32Hi_4x4;\n\
\n\
_viv_uniform VXC_512Bits uniUInt8SquareLo_4x4;\n\
_viv_uniform VXC_512Bits uniUInt8SquareHi_4x4;\n\
@@ -14021,10 +14400,9 @@ __kernel void l2normalizescale_axis1_F16_##in1_name##to##out_name##_2D \\\n\
\n\
L2NORMSCALE_AXIS1_F16_2D(F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\
\n\
-\n\
-#define L2NORMSCALE_AXIS1_I8_2D(in1_name, out_name,\\\n\
+#define L2NORMSCALE_AXIS1_QNT_2D(in0_name, in1_name, out_name,\\\n\
input_type, incopy_type, output_type, convert_type, copy_type) \\\n\
-__kernel void l2normalizescale_axis1_I8_##in1_name##to##out_name##_2D \\\n\
+__kernel void l2normalizescale_axis1_##in0_name##_##in1_name##to##out_name##_2D \\\n\
(\\\n\
__read_only image2d_array_t input,\\\n\
__read_only image2d_array_t scale,\\\n\
@@ -14033,93 +14411,11 @@ __kernel void l2normalizescale_axis1_I8_##in1_name##to##out_name##_2D \\\n\
)\\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\
- vxc_char8 src0_I8, src1_I8; \\\n\
- vxc_uint4 dst0_I8 = 0, dst1_I8 = 0; \\\n\
- for(int i = 0; i < L2NorS_depth; i += 2) \\\n\
- { \\\n\
- VXC_ReadImage(src0_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(src1_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord.y += 2; \\\n\
- VXC_DP4x4(dst0_I8, src0_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniDataSquareAddU32Lo_4x4); \\\n\
- VXC_DP4x4(dst1_I8, src0_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniDataSquareAddU32Hi_4x4); \\\n\
- VXC_DP4x4(dst0_I8, src1_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniDataSquareAddU32Lo_4x4); \\\n\
- VXC_DP4x4(dst1_I8, src1_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniDataSquareAddU32Hi_4x4); \\\n\
- } \\\n\
- vxc_float4 sum_lo, sum_hi; \\\n\
- sum_lo = convert_float4(dst0_I8); \\\n\
- sum_hi = convert_float4(dst1_I8); \\\n\
- sum_lo = rsqrt(sum_lo) * r_inputScale; \\\n\
- sum_hi = rsqrt(sum_hi) * r_inputScale; \\\n\
- L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\
-}\n\
-\n\
-L2NORMSCALE_AXIS1_I8_2D(F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\
-L2NORMSCALE_AXIS1_I8_2D(F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\
-\n\
-\n\
-#define L2NORMSCALE_AXIS1_I16_2D(in1_name, out_name,\\\n\
- input_type, incopy_type, output_type, convert_type, copy_type) \\\n\
-__kernel void l2normalizescale_axis1_I16_##in1_name##to##out_name##_2D \\\n\
- (\\\n\
- __read_only image2d_array_t input,\\\n\
- __read_only image2d_array_t scale,\\\n\
- __write_only image2d_array_t output,\\\n\
- int axis\\\n\
- )\\\n\
-{ \\\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\
- vxc_short8 src0_I16, src1_I16; \\\n\
+ input_type src0_U8, src1_U8; \\\n\
vxc_float4 squr, sum_lo = 0, sum_hi = 0; \\\n\
for(int i = 0; i < L2NorS_depth; i += 2) \\\n\
{ \\\n\
- VXC_ReadImage(src0_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(src1_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord.y += 2; \\\n\
- VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniIntegerSquareLo_4x4); \\\n\
- sum_lo = squr + sum_lo; \\\n\
- VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniIntegerSquareHi_4x4); \\\n\
- sum_hi = squr + sum_hi; \\\n\
- VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniIntegerSquareLo_4x4); \\\n\
- sum_lo = squr + sum_lo; \\\n\
- VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
- uniIntegerSquareHi_4x4); \\\n\
- sum_hi = squr + sum_hi; \\\n\
- } \\\n\
- sum_lo = rsqrt(sum_lo) * r_inputScale; \\\n\
- sum_hi = rsqrt(sum_hi) * r_inputScale; \\\n\
- L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\
-}\n\
-\n\
-L2NORMSCALE_AXIS1_I16_2D(F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\
-L2NORMSCALE_AXIS1_I16_2D(F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\
-\n\
-#define L2NORMSCALE_AXIS1_U8_2D(in1_name, out_name,\\\n\
- input_type, incopy_type, output_type, convert_type, copy_type) \\\n\
-__kernel void l2normalizescale_axis1_U8_##in1_name##to##out_name##_2D \\\n\
- (\\\n\
- __read_only image2d_array_t input,\\\n\
- __read_only image2d_array_t scale,\\\n\
- __write_only image2d_array_t output,\\\n\
- int axis\\\n\
- )\\\n\
-{ \\\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\
- vxc_uchar8 src0_U8, src1_U8; \\\n\
- vxc_float4 squr, sum_lo = 0, sum_hi = 0; \\\n\
- for(int i = 0; i < L2NorS_depth; i += 2) \\\n\
- { \\\n\
- vxc_uchar8 zero; \\\n\
+ vxc_short2 zero; \\\n\
VXC_ReadImage(src0_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(src1_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\
@@ -14140,8 +14436,12 @@ __kernel void l2normalizescale_axis1_U8_##in1_name##to##out_name##_2D \\\n\
L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\
}\n\
\n\
-L2NORMSCALE_AXIS1_U8_2D(F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\
-L2NORMSCALE_AXIS1_U8_2D(F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\
+L2NORMSCALE_AXIS1_QNT_2D(U8, F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\
+L2NORMSCALE_AXIS1_QNT_2D(U8, F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\
+L2NORMSCALE_AXIS1_QNT_2D(I8, F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\
+L2NORMSCALE_AXIS1_QNT_2D(I8, F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\
+L2NORMSCALE_AXIS1_QNT_2D(I16, F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\
+L2NORMSCALE_AXIS1_QNT_2D(I16, F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\
"; /* end of l2normalizescale_axis1_vx*/
static const char layer_normalization_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
@@ -25168,7 +25468,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\
GEMM_QINT16_TO_F16(I16, vxc_short8)\n\
"; /* end of matrixmul_u8u8_f16_vx*/
-static const char maximum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char maximum_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
__kernel void maximum_F16F16toF16\n\
(\n\
@@ -25181,10 +25481,10 @@ __kernel void maximum_F16F16toF16\n\
\n\
vxc_short8 vec0, vec1, dst;\n\
vxc_half8 src0, src1;\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage2DArray(vec0, input0, coord, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage2DArray(vec1, input1, coord, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src1, vec1, 16);\n\
\n\
@@ -25205,10 +25505,10 @@ __kernel void maximum_F16F16toF16_2D\n\
\n\
vxc_short8 vec0, vec1, dst;\n\
vxc_half8 src0, src1;\n\
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage(vec0, input0, coord.xy, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage(vec1, input1, coord.xy, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src1, vec1, 16);\n\
\n\
@@ -25220,835 +25520,363 @@ __kernel void maximum_F16F16toF16_2D\n\
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
}\n\
\n\
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\
-\n\
-__kernel void maximum_F16F16toI8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 vec0, vec1;\n\
- vxc_char8 dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_F16F16toI8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_short8 vec0, vec1;\n\
- vxc_char8 dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8;\n\
-__kernel void maximum_I8I8toI8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_char16 src0, src1, dst;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\
- dst = max(src0, src1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I8I8toI8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_char16 src0, src1, dst;\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- coord.z ++;\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\
- dst = max(src0, src1);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;\n\
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\
-__kernel void maximum_U8U8toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
\n\
- vxc_uchar16 src0, src1, dst;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Hi_2x8);\n\
- dst = max(src0, src1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
+#define MAXIMUM_8BITS_QUANT_IMPL(name, dtype) \\\n\
+__kernel void maximum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ dtype src0, src1, dst; \\\n\
+ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Hi_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Hi_2x8); \\\n\
+ dst = max(src0, src1); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MAXIMUM_8BITS_QUANT_IMPL(U8U8toU8, vxc_uchar16)\n\
+MAXIMUM_8BITS_QUANT_IMPL(I8I8toI8, vxc_char16)\n\
\n\
-__kernel void maximum_U8U8toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_uchar16 src0, src1, dst;\n\
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Hi_2x8);\n\
- dst = max(src0, src1);\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
+#define MAXIMUM_8BITS_2D_QUANT_IMPL(name, dtype) \\\n\
+__kernel void maximum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ dtype src0, src1, dst; \\\n\
+ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Hi_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Hi_2x8); \\\n\
+ dst = max(src0, src1); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MAXIMUM_8BITS_2D_QUANT_IMPL(U8U8toU8, vxc_uchar16)\n\
+MAXIMUM_8BITS_2D_QUANT_IMPL(I8I8toI8, vxc_char16)\n\
\n\
-__kernel void maximum_U8U8toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_uchar16 src0, src1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_short8 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = max(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
+#define MAXIMUM_QUANT_IMPL(name, src_type, copy_type, dst_type) \\\n\
+__kernel void maximum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ copy_type data0, data1; \\\n\
+ src_type src0, src1; \\\n\
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = max(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MAXIMUM_QUANT_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)\n\
+MAXIMUM_QUANT_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MAXIMUM_QUANT_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)\n\
+MAXIMUM_QUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)\n\
+MAXIMUM_QUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)\n\
+MAXIMUM_QUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
\n\
-__kernel void maximum_U8U8toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_uchar16 src0, src1;\n\
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_short8 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = max(dst0, dst1);\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
+#define MAXIMUM_QUANT_2D_IMPL(name, src_type, copy_type, dst_type) \\\n\
+__kernel void maximum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ copy_type data0, data1; \\\n\
+ src_type src0, src1; \\\n\
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = max(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\
-__kernel void maximum_I16I16toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\
- dst = max(src0, src1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I16I16toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- coord.z ++;\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\
- dst = max(src0, src1);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-"; /* end of maximum_vx*/
+MAXIMUM_QUANT_2D_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)\n\
+MAXIMUM_QUANT_2D_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MAXIMUM_QUANT_2D_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)\n\
+MAXIMUM_QUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)\n\
+MAXIMUM_QUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)\n\
+MAXIMUM_QUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
+"; /* end of maximum_0_vx*/
-static const char maximum_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char maximum_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\
-\n\
-__kernel void maximum_I8F16toI8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_char16 src0, src2, dst;\n\
- vxc_short8 src1, src3, src4, src5;\n\
- vxc_half8 data0, data1, data2, data3;\n\
- vxc_char16 tmp0, tmp1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
- _viv_asm(COPY, data1, src4, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- dst = max(src0, tmp0);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I8F16toI8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_char16 src0, src2, dst;\n\
- vxc_short8 src1, src3, src4, src5;\n\
- vxc_half8 data0, data1, data2, data3;\n\
- vxc_char16 tmp0;\n\
-\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
- _viv_asm(COPY, data1, src4, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- dst = max(src0, tmp0);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I8F16toF16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_char8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
-\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I8F16toF16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_char8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
\n\
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\
-_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\
-\n\
-__kernel void maximum_U8F16toF16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_uchar8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
-\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- vxc_ushort8 ms0;\n\
- _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniU8MulAndPostShift_0_Lo_2x8);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_U8F16toF16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_uchar8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
-\n\
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- vxc_ushort8 ms0;\n\
- _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniU8MulAndPostShift_0_Lo_2x8);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\
-__kernel void maximum_U8F16toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_uchar16 src0, dst0, dst1;\n\
- vxc_ushort8 src1, src2;\n\
- vxc_half8 data1, data2;\n\
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data1, src1, 16);\n\
- _viv_asm(COPY, data2, src2, 16);\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = max(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_U8F16toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_uchar16 src0, dst0, dst1;\n\
- vxc_ushort8 src1, src2;\n\
- vxc_half8 data1, data2;\n\
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data1, src1, 16);\n\
- _viv_asm(COPY, data2, src2, 16);\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = max(dst0, dst1);\n\
-\n\
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_F16F16toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_ushort8 src0, src1;\n\
- vxc_half8 data0, data1;\n\
- vxc_uchar16 dst0, dst1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- _viv_asm(COPY, data1, src1, 16);\n\
-\n\
- vxc_ushort8 mp1;\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = max(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_F16F16toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_ushort8 src0, src1;\n\
- vxc_half8 data0, data1;\n\
- vxc_uchar16 dst0, dst1;\n\
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- _viv_asm(COPY, data1, src1, 16);\n\
-\n\
- vxc_ushort8 mp1;\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = max(dst0, dst1);\n\
-\n\
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-"; /* end of maximum_fp16_vx*/
-
-static const char maximum_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertI16toI16_2x8;\n\
-_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\
-_viv_uniform float outputScale;\n\
-_viv_uniform float output_zp;\n\
-_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
-_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\
-_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4;\n\
-\n\
-\n\
-__kernel void maximum_I16F16toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 src0, src1, tmp0, dst;\n\
- vxc_half8 data0;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\
- dst = max(src0, tmp0);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I16F16toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_short8 src0, src1, tmp0, dst;\n\
- vxc_half8 data0;\n\
-\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\
- dst = max(src0, tmp0);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I16F16toF16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 vec0, vec1, dst;\n\
- vxc_half8 src0, src1;\n\
-\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_I16F16toF16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_short8 vec0, vec1, dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_F16F16toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 vec0, vec1;\n\
- vxc_short8 dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- int4 tmpDst0, tmpDst1;\n\
- float4 tmpData0, tmpData1;\n\
- VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\
- VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void maximum_F16F16toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_short8 vec0, vec1;\n\
- vxc_short8 dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- int4 tmpDst0, tmpDst1;\n\
- float4 tmpData0, tmpData1;\n\
- VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\
- VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\
-__kernel void maximum_I16I16toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
\n\
- vxc_short8 src0, src1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_uchar16 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = max(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
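+// Helper macro replacing the per-type maximum kernels above: input0 (quantized or F16)\n\
+// and F16 input1 are both rescaled to the quantized output type, then the elementwise\n\
+// max of the 8 lanes is written (image2d_array variant).\n\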
+#define MAXIMUM_F16TOQUANT_IMPL(name, src0_type, copy_type, dst_type) \\\n\
+__kernel void maximum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ copy_type data0; \\\n\
+ src0_type src0; \\\n\
+ vxc_half8 src1; \\\n\
+ vxc_short8 data1; \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = max(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MAXIMUM_F16TOQUANT_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)\n\
+MAXIMUM_F16TOQUANT_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)\n\
+MAXIMUM_F16TOQUANT_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MAXIMUM_F16TOQUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
+MAXIMUM_F16TOQUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\
+MAXIMUM_F16TOQUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\
\n\
-__kernel void maximum_I16I16toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
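+// 2D-image variant of MAXIMUM_F16TOQUANT_IMPL (VXC_ReadImage/VXC_WriteImage instead\n\
+// of the image2d_array accessors).\n\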
+#define MAXIMUM_F16TOQUANT_2D_IMPL(name, src0_type, copy_type, dst_type) \\\n\
+__kernel void maximum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ copy_type data0; \\\n\
+ src0_type src0; \\\n\
+ vxc_half8 src1; \\\n\
+ vxc_short8 data1; \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = max(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MAXIMUM_F16TOQUANT_2D_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)\n\
+MAXIMUM_F16TOQUANT_2D_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)\n\
+MAXIMUM_F16TOQUANT_2D_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MAXIMUM_F16TOQUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
+MAXIMUM_F16TOQUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\
+MAXIMUM_F16TOQUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\
\n\
- vxc_short8 src0, src1;\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
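+// Helper macro: quantized input0 is converted to F16 with multAndoutZP0, then compared\n\
+// against F16 input1 via VXC_VertMax3_Half; the result is written as F16 (image2d_array).\n\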
+#define MAXIMUM_QUANT_F16TOF16_IMPL(name, src_type) \\\n\
+__kernel void maximum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ src_type vec0; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 data1, dst; \\\n\
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 ms0; \\\n\
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MAXIMUM_QUANT_F16TOF16_IMPL(U8F16toF16, vxc_uchar16)\n\
+MAXIMUM_QUANT_F16TOF16_IMPL(I8F16toF16, vxc_char16)\n\
+MAXIMUM_QUANT_F16TOF16_IMPL(I16F16toF16, vxc_short8)\n\
\n\
- vxc_uchar16 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = max(dst0, dst1);\n\
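+// 2D-image variant of MAXIMUM_QUANT_F16TOF16_IMPL.\n\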
+#define MAXIMUM_QUANT_F16TOF16_2D_IMPL(name, src_type) \\\n\
+__kernel void maximum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ src_type vec0; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 data1, dst; \\\n\
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 ms0; \\\n\
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MAXIMUM_QUANT_F16TOF16_2D_IMPL(U8F16toF16, vxc_uchar16)\n\
+MAXIMUM_QUANT_F16TOF16_2D_IMPL(I8F16toF16, vxc_char16)\n\
+MAXIMUM_QUANT_F16TOF16_2D_IMPL(I16F16toF16, vxc_short8)\n\
\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}"; /* end of maximum_i16_vx*/
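+// Helper macro: both quantized inputs are converted to F16 with their own\n\
+// multiplier/zero-point, then the half-precision max is written as F16 (image2d_array).\n\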
+#define MAXIMUM_QUANTTOF16_IMPL(name, src_type) \\\n\
+__kernel void maximum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ src_type vec0, vec1; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 dst; \\\n\
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MAXIMUM_QUANTTOF16_IMPL(U8U8toF16, vxc_uchar16)\n\
+MAXIMUM_QUANTTOF16_IMPL(I8I8toF16, vxc_char16)\n\
+MAXIMUM_QUANTTOF16_IMPL(I16I16toF16, vxc_short8)\n\
+\n\
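+// 2D-image variant of MAXIMUM_QUANTTOF16_IMPL.\n\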
+#define MAXIMUM_QUANTTOF16_2D_IMPL(name, src_type) \\\n\
+__kernel void maximum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ src_type vec0, vec1; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 dst; \\\n\
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MAXIMUM_QUANTTOF16_2D_IMPL(U8U8toF16, vxc_uchar16)\n\
+MAXIMUM_QUANTTOF16_2D_IMPL(I8I8toF16, vxc_char16)\n\
+MAXIMUM_QUANTTOF16_2D_IMPL(I16I16toF16, vxc_short8)"; /* end of maximum_1_vx*/
-static const char minimum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char minimum_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
__kernel void minimum_F16F16toF16\n\
(\n\
@@ -26061,10 +25889,10 @@ __kernel void minimum_F16F16toF16\n\
\n\
vxc_short8 vec0, vec1, dst;\n\
vxc_half8 src0, src1;\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage2DArray(vec0, input0, coord, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage2DArray(vec1, input1, coord, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src1, vec1, 16);\n\
\n\
@@ -26085,10 +25913,10 @@ __kernel void minimum_F16F16toF16_2D\n\
\n\
vxc_short8 vec0, vec1, dst;\n\
vxc_half8 src0, src1;\n\
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage(vec0, input0, coord.xy, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
+ VXC_ReadImage(vec1, input1, coord.xy, 0,\\\n\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
_viv_asm(COPY, src1, vec1, 16);\n\
\n\
@@ -26100,839 +25928,360 @@ __kernel void minimum_F16F16toF16_2D\n\
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
}\n\
\n\
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\
-\n\
-__kernel void minimum_F16F16toI8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 vec0, vec1;\n\
- vxc_char8 dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_F16F16toI8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_short8 vec0, vec1;\n\
- vxc_char8 dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src0, vec0, 16);\n\
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8;\n\
-__kernel void minimum_I8I8toI8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_char16 src0, src1, dst;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\
- dst = min(src0, src1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I8I8toI8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_char16 src0, src1, dst;\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- coord.z ++;\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\
- dst = min(src0, src1);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;\n\
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\
-__kernel void minimum_U8U8toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
\n\
- vxc_uchar16 src0, src1, dst;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Hi_2x8);\n\
- dst = min(src0, src1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
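+// Helper macro replacing the per-type 8-bit minimum kernels above: both 16-lane inputs\n\
+// are requantized in low/high halves, then the elementwise min is written (image2d_array).\n\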
+#define MINIMUM_8BITS_QUANT_IMPL(name, dtype) \\\n\
+__kernel void minimum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ dtype src0, src1, dst; \\\n\
+ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Hi_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Hi_2x8); \\\n\
+ dst = min(src0, src1); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MINIMUM_8BITS_QUANT_IMPL(U8U8toU8, vxc_uchar16)\n\
+MINIMUM_8BITS_QUANT_IMPL(I8I8toI8, vxc_char16)\n\
\n\
-__kernel void minimum_U8U8toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_uchar16 src0, src1, dst;\n\
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Hi_2x8);\n\
- dst = min(src0, src1);\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
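+// 2D-image variant of MINIMUM_8BITS_QUANT_IMPL.\n\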
+#define MINIMUM_8BITS_2D_QUANT_IMPL(name, dtype) \\\n\
+__kernel void minimum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ dtype src0, src1, dst; \\\n\
+ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Hi_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Hi_2x8); \\\n\
+ dst = min(src0, src1); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MINIMUM_8BITS_2D_QUANT_IMPL(U8U8toU8, vxc_uchar16)\n\
+MINIMUM_8BITS_2D_QUANT_IMPL(I8I8toI8, vxc_char16)\n\
\n\
-__kernel void minimum_U8U8toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_uchar16 src0, src1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_short8 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = min(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
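+// Helper macro: two 8-lane inputs (quantized or F16) are requantized to the output\n\
+// type, then the elementwise min is written (image2d_array variant).\n\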
+#define MINIMUM_QUANT_IMPL(name, src_type, copy_type, dst_type) \\\n\
+__kernel void minimum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ copy_type data0, data1; \\\n\
+ src_type src0, src1; \\\n\
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = min(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MINIMUM_QUANT_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)\n\
+MINIMUM_QUANT_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MINIMUM_QUANT_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)\n\
+MINIMUM_QUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)\n\
+MINIMUM_QUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)\n\
+MINIMUM_QUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
\n\
-__kernel void minimum_U8U8toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_uchar16 src0, src1;\n\
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_short8 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = min(dst0, dst1);\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
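+// 2D-image variant of MINIMUM_QUANT_IMPL.\n\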
+#define MINIMUM_QUANT_2D_IMPL(name, src_type, copy_type, dst_type) \\\n\
+__kernel void minimum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ copy_type data0, data1; \\\n\
+ src_type src0, src1; \\\n\
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = min(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\
-__kernel void minimum_I16I16toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\
- dst = min(src0, src1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I16I16toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- coord.z ++;\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\
- VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\
- dst = min(src0, src1);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-"; /* end of minimum_vx*/
+MINIMUM_QUANT_2D_IMPL(U8U8toI16, vxc_uchar16, vxc_uchar16, vxc_short8)\n\
+MINIMUM_QUANT_2D_IMPL(I16I16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MINIMUM_QUANT_2D_IMPL(I16I16toU8, vxc_short8, vxc_short8, vxc_uchar16)\n\
+MINIMUM_QUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar16)\n\
+MINIMUM_QUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char16)\n\
+MINIMUM_QUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
+"; /* end of minimum_0_vx*/
-static const char minimum_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\
-_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\
-\n\
-__kernel void minimum_I8F16toI8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_char16 src0, src2, dst;\n\
- vxc_short8 src1, src3, src4, src5;\n\
- vxc_half8 data0, data1, data2, data3;\n\
- vxc_char16 tmp0, tmp1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
- _viv_asm(COPY, data1, src4, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- dst = min(src0, tmp0);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I8F16toI8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_char16 src0, src2, dst;\n\
- vxc_short8 src1, src3, src4, src5;\n\
- vxc_half8 data0, data1, data2, data3;\n\
- vxc_char16 tmp0;\n\
-\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
- _viv_asm(COPY, data1, src4, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\
- dst = min(src0, tmp0);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I8F16toF16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_char8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
-\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I8F16toF16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_char8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
+static const char minimum_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\
-_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\
-\n\
-__kernel void minimum_U8F16toF16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_uchar8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
-\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- vxc_ushort8 ms0;\n\
- _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniU8MulAndPostShift_0_Lo_2x8);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_U8F16toF16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_uchar8 vec0, vec2;\n\
- vxc_short8 vec1, vec3, dst;\n\
- vxc_half8 src0, src1, src2, src3;\n\
-\n\
- VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- vxc_ushort8 ms0;\n\
- _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\
- VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniU8MulAndPostShift_0_Lo_2x8);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\
-_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\
-__kernel void minimum_U8F16toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_uchar16 src0, dst0, dst1;\n\
- vxc_ushort8 src1, src2;\n\
- vxc_half8 data1, data2;\n\
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data1, src1, 16);\n\
- _viv_asm(COPY, data2, src2, 16);\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = min(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_U8F16toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_uchar16 src0, dst0, dst1;\n\
- vxc_ushort8 src1, src2;\n\
- vxc_half8 data1, data2;\n\
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data1, src1, 16);\n\
- _viv_asm(COPY, data2, src2, 16);\n\
-\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Hi_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = min(dst0, dst1);\n\
-\n\
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_F16F16toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_ushort8 src0, src1;\n\
- vxc_half8 data0, data1;\n\
- vxc_uchar16 dst0, dst1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- _viv_asm(COPY, data1, src1, 16);\n\
-\n\
- vxc_ushort8 mp1;\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = min(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_F16F16toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_ushort8 src0, src1;\n\
- vxc_half8 data0, data1;\n\
- vxc_uchar16 dst0, dst1;\n\
- VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- _viv_asm(COPY, data1, src1, 16);\n\
-\n\
- vxc_ushort8 mp1;\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniConvertFp16toU8_2x8);\n\
- dst0 = min(dst0, dst1);\n\
-\n\
- VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-"; /* end of minimum_fp16_vx*/
-
-static const char minimum_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertI16toI16_2x8;\n\
-_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;\n\
-_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\
-\n\
-_viv_uniform float outputScale;\n\
-_viv_uniform float output_zp;\n\
-_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
-_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\
-_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4;\n\
-\n\
-__kernel void minimum_I16F16toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 src0, src1, tmp0, dst;\n\
- vxc_half8 data0;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\
- dst = min(src0, tmp0);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I16F16toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_short8 src0, src1, tmp0, dst;\n\
- vxc_half8 data0;\n\
-\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src1, 16);\n\
-\n\
- VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\
- VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\
- dst = min(src0, tmp0);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I16F16toF16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 vec0, vec1, dst;\n\
- vxc_half8 src0, src1;\n\
-\n\
- VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_I16F16toF16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_short8 vec0, vec1, dst;\n\
- vxc_half8 src0, src1;\n\
- VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, src1, vec1, 16);\n\
-\n\
- VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\
-\n\
- VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
- _viv_asm(COPY, dst, src0, 16);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_F16F16toI16\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- vxc_half8 data0, data1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- _viv_asm(COPY, data1, src1, 16);\n\
-\n\
- VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
-\n\
- int4 tmpDst0, tmpDst1;\n\
- float4 tmpData0, tmpData1;\n\
- VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\
- VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void minimum_F16F16toI16_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- vxc_half8 data0, data1;\n\
-\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- _viv_asm(COPY, data1, src1, 16);\n\
-\n\
- VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\
-\n\
- int4 tmpDst0, tmpDst1;\n\
- float4 tmpData0, tmpData1;\n\
- VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\
- VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\
- tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\
- tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\
- VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\
-\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\
-_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\
-_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\
-__kernel void minimum_I16I16toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
\n\
- vxc_short8 src0, src1;\n\
- VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- vxc_uchar16 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = min(dst0, dst1);\n\
-\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
+#define MINIMUM_F16TOQUANT_IMPL(name, src0_type, copy_type, dst_type) \\\n\
+__kernel void minimum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ copy_type data0; \\\n\
+ src0_type src0; \\\n\
+ vxc_half8 src1; \\\n\
+ vxc_short8 data1; \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ VXC_ReadImage2DArray(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = min(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
}\n\
+MINIMUM_F16TOQUANT_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)\n\
+MINIMUM_F16TOQUANT_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)\n\
+MINIMUM_F16TOQUANT_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MINIMUM_F16TOQUANT_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
+MINIMUM_F16TOQUANT_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\
+MINIMUM_F16TOQUANT_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\
\n\
-__kernel void minimum_I16I16toU8_2D\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __write_only image2d_array_t output\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
+#define MINIMUM_F16TOQUANT_2D_IMPL(name, src0_type, copy_type, dst_type) \\\n\
+__kernel void minimum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ copy_type data0; \\\n\
+ src0_type src0; \\\n\
+ vxc_half8 src1; \\\n\
+ vxc_short8 data1; \\\n\
+ dst_type dst0, dst1, dst; \\\n\
+ VXC_ReadImage(data0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, data0, 16); \\\n\
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ dst = min(dst0, dst1); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MINIMUM_F16TOQUANT_2D_IMPL(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar8)\n\
+MINIMUM_F16TOQUANT_2D_IMPL(I8F16toI8, vxc_char16, vxc_char16, vxc_char8)\n\
+MINIMUM_F16TOQUANT_2D_IMPL(I16F16toI16, vxc_short8, vxc_short8, vxc_short8)\n\
+MINIMUM_F16TOQUANT_2D_IMPL(F16F16toI16, vxc_half8, vxc_short8, vxc_short8)\n\
+MINIMUM_F16TOQUANT_2D_IMPL(F16F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\
+MINIMUM_F16TOQUANT_2D_IMPL(F16F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\
\n\
- vxc_short8 src0, src1;\n\
- VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+#define MINIMUM_QUANT_F16TOF16_IMPL(name, src_type) \\\n\
+__kernel void minimum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ src_type vec0; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 data1, dst; \\\n\
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 ms0; \\\n\
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MINIMUM_QUANT_F16TOF16_IMPL(U8F16toF16, vxc_uchar16)\n\
+MINIMUM_QUANT_F16TOF16_IMPL(I8F16toF16, vxc_char16)\n\
+MINIMUM_QUANT_F16TOF16_IMPL(I16F16toF16, vxc_short8)\n\
\n\
- vxc_uchar16 dst0, dst1, dst;\n\
- vxc_ushort8 mp0, mp1;\n\
- _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\
- _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\
- VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift0_Lo_2x8);\n\
- VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\
- uniU8MulAndPostShift1_Lo_2x8);\n\
- dst = min(dst0, dst1);\n\
+#define MINIMUM_QUANT_F16TOF16_2D_IMPL(name, src_type) \\\n\
+__kernel void minimum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ src_type vec0; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 data1, dst; \\\n\
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(data1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, data1, 16); \\\n\
+ \\\n\
+ vxc_ushort8 ms0; \\\n\
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\
+ VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MINIMUM_QUANT_F16TOF16_2D_IMPL(U8F16toF16, vxc_uchar16)\n\
+MINIMUM_QUANT_F16TOF16_2D_IMPL(I8F16toF16, vxc_char16)\n\
+MINIMUM_QUANT_F16TOF16_2D_IMPL(I16F16toF16, vxc_short8)\n\
\n\
- VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\
-}"; /* end of minimum_i16_vx*/
+#define MINIMUM_QUANTTOF16_IMPL(name, src_type) \\\n\
+__kernel void minimum_##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ src_type vec0, vec1; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 dst; \\\n\
+ VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MINIMUM_QUANTTOF16_IMPL(U8U8toF16, vxc_uchar16)\n\
+MINIMUM_QUANTTOF16_IMPL(I8I8toF16, vxc_char16)\n\
+MINIMUM_QUANTTOF16_IMPL(I16I16toF16, vxc_short8)\n\
+\n\
+#define MINIMUM_QUANTTOF16_2D_IMPL(name, src_type) \\\n\
+__kernel void minimum_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ src_type vec0, vec1; \\\n\
+ vxc_half8 src0, src1; \\\n\
+ vxc_short8 dst; \\\n\
+ VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ vxc_ushort8 mp0, mp1; \\\n\
+ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\
+ VXC_DP2x8(src0, vec0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift0_Lo_2x8); \\\n\
+ VXC_DP2x8(src1, vec1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift1_Lo_2x8); \\\n\
+ \\\n\
+ VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\
+ _viv_asm(COPY, dst, src0, 16); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+MINIMUM_QUANTTOF16_2D_IMPL(U8U8toF16, vxc_uchar16)\n\
+MINIMUM_QUANTTOF16_2D_IMPL(I8I8toF16, vxc_char16)\n\
+MINIMUM_QUANTTOF16_2D_IMPL(I16I16toF16, vxc_short8)"; /* end of minimum_1_vx*/
static const char moments_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
@@ -29069,14 +28418,17 @@ __kernel void one_hot_##name0##to##name1 \\\n\
coord.z ++; \\\n\
} while (coord.z < depth); \\\n\
}\n\
-ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
-ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
-ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL(I16, BI16, vxc_short8, vxc_short8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL(I16, I8, vxc_short8, vxc_short8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL(I16, U8, vxc_short8, vxc_short8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\
\n\
#define ONE_HOT_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \\\n\
__kernel void one_hot_##name0##to##name1##_2D \\\n\
@@ -29125,14 +28477,17 @@ __kernel void one_hot_##name0##to##name1##_2D \\\n\
coord.y += 4; \\\n\
} while (coord.y < depth); \\\n\
}\n\
-ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
-ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
-ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\
-ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL_2D(I16, BI16, vxc_short8, vxc_short8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL_2D(I16, I8, vxc_short8, vxc_short8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL_2D(I16, U8, vxc_short8, vxc_short8, vxc_uchar8)\n\
+ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\
+ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\
\n\
_viv_uniform float input_scale;\n\
_viv_uniform float input_tail;\n\
@@ -29176,8 +28531,11 @@ __kernel void one_hot_##name0##to##name1 \\\n\
coord.z ++; \\\n\
} while (coord.z < depth); \\\n\
}\n\
-ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
-ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\
+ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
+ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\
+ONE_HOT_ASYM_SH_IMPL(U8, I8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\
+ONE_HOT_ASYM_SH_IMPL(U8, I16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
+ONE_HOT_ASYM_SH_IMPL(U8, BI16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
\n\
#define ONE_HOT_ASYM_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \\\n\
__kernel void one_hot_##name0##to##name1##_2D \\\n\
@@ -29228,8 +28586,11 @@ __kernel void one_hot_##name0##to##name1##_2D \\\n\
coord.y += 4; \\\n\
} while (coord.y < depth); \\\n\
}\n\
-ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
-ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\
+ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
+ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\
+ONE_HOT_ASYM_SH_IMPL_2D(U8, I8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\
+ONE_HOT_ASYM_SH_IMPL_2D(U8, I16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
+ONE_HOT_ASYM_SH_IMPL_2D(U8, BI16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\
\n\
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\
@@ -33212,6 +32573,615 @@ IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\
IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\
"; /* end of pre_process_rgb_vx*/
+static const char pre_process_rgb888_planar_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniVecShift10;\n\
+_viv_uniform VXC_512Bits uniAddRShift;\n\
+_viv_uniform VXC_512Bits uniGetTempVal;\n\
+_viv_uniform VXC_512Bits uniExtractBytes;\n\
+\n\
+_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+\n\
+_viv_uniform float output_scale;\n\
+_viv_uniform float output_zp;\n\
+\n\
+#define RESIZE_BILINEAR_4X1(input, mean, output) \\\n\
+ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst1, 8); \\\n\
+ VXC_WriteImage(output, coord_out, dst, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output0, \\\n\
+ __write_only image2d_array_t output1, \\\n\
+ __write_only image2d_array_t output2, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float f32Var \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
+ \\\n\
+ int4 xPos = get_global_id(0); \\\n\
+ int yPos = get_global_id(1); \\\n\
+ \\\n\
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\
+ xPos += (int4)(0, 1, 2, 3); \\\n\
+ \\\n\
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\
+ int4 sx = fx0 & 0xffff8000; \\\n\
+ fx0 -= sx; \\\n\
+ sx = sx >> 15; \\\n\
+ \\\n\
+ vxc_short4 fx; \\\n\
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAddRShift); \\\n\
+ \\\n\
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\
+ int sy = fy & 0xffff8000; \\\n\
+ \\\n\
+ fy -= sy; \\\n\
+ sy = sy >> 15; \\\n\
+ \\\n\
+ fy = (fy + (1<< 4)) >> 5; \\\n\
+ \\\n\
+ vxc_uchar16 line0Y; \\\n\
+ vxc_uchar16 line1Y; \\\n\
+ int4 coord; \\\n\
+ sx = sx + *xOffset; \\\n\
+ coord.xyz = sx.xyz; \\\n\
+ coord.w = sy + *yOffset; \\\n\
+ int2 coord1 = (int2)(sx.w, coord.w); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ int4 test01, temp1; \\\n\
+ int4 test02, temp2; \\\n\
+    int2 coord_out = (int2)(xPos.x, yPos); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ \\\n\
+ vxc_float4 tmp_dst; \\\n\
+ vxc_uchar4 u8_dst; \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ conv_type dst0; \\\n\
+ dst_type dst1; \\\n\
+ copy_type dst; \\\n\
+ tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst1, 8); \\\n\
+ VXC_WriteImage(output0, coord_out, dst, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ RESIZE_BILINEAR_4X1(input1, gMean, output1) \\\n\
+ RESIZE_BILINEAR_4X1(input2, bMean, output2) \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\
+PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\
+\n\
+#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output0, \\\n\
+ __write_only image2d_array_t output1, \\\n\
+ __write_only image2d_array_t output2, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float f32Var \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
+ int4 xPos = get_global_id(0); \\\n\
+ int yPos = get_global_id(1); \\\n\
+ \\\n\
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\
+ xPos += (int4)(0, 1, 2, 3); \\\n\
+ \\\n\
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\
+ int4 sx = fx0 & 0xffff8000; \\\n\
+ fx0 -= sx; \\\n\
+ sx = sx >> 15; \\\n\
+ \\\n\
+ vxc_short4 fx; \\\n\
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\
+ \\\n\
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\
+ int sy = fy & 0xffff8000; \\\n\
+ \\\n\
+ fy -= sy; \\\n\
+ sy = sy >> 15; \\\n\
+ fy = (fy + (1<< 4)) >> 5; \\\n\
+ \\\n\
+ vxc_uchar16 line0Y; \\\n\
+ vxc_uchar16 line1Y; \\\n\
+ int4 coord; \\\n\
+ sx = sx + *xOffset; \\\n\
+ coord.xyz = sx.xyz; \\\n\
+ coord.w = sy + *yOffset; \\\n\
+ int2 coord1 = (int2)(sx.w, coord.w); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ int4 test01, temp1; \\\n\
+ int4 test02, temp2; \\\n\
+ int2 coord_out = (int2)(xPos.x, yPos); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ \\\n\
+ vxc_float4 tmp_dst; \\\n\
+ vxc_uchar4 u8_dst; \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ int4 dst0; \\\n\
+ write_type dst; \\\n\
+ tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ \\\n\
+ VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ \\\n\
+ VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ \\\n\
+ VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\
+PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/
+
+static const char pre_process_rgb888_planar_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\
+_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\
+\n\
+_viv_uniform float output_scale;\n\
+_viv_uniform float output_zp;\n\
+\n\
+#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output0, \\\n\
+ __write_only image2d_array_t output1, \\\n\
+ __write_only image2d_array_t output2, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float f32Var \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ coord.xy += (int2)(*xOffset, *yOffset); \\\n\
+ vxc_uchar16 src0, src1, src2; \\\n\
+ dst_type dst0, dst1; \\\n\
+ \\\n\
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ coord.x = coord.z + 8; \\\n\
+ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\
+ rMean * output_scale - output_zp, output_scale); \\\n\
+ \\\n\
+ half4 paramData_f16; \\\n\
+ copy_type tmp_dst; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData0); \\\n\
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
+ VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
+ VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\
+ gMean * output_scale - output_zp, output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData1); \\\n\
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
+ VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
+ VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\
+ bMean * output_scale - output_zp, output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData2); \\\n\
+ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
+ VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
+ VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\
+\n\
+#define PRE_PROCESS_RGB888_PLANAR_COPY_8BITS(dst_name, write_type) \\\n\
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output0, \\\n\
+ __write_only image2d_array_t output1, \\\n\
+ __write_only image2d_array_t output2, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float f32Var \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ coord.xy += (int2) (*xOffset, *yOffset); \\\n\
+ vxc_uchar16 src0, src1, src2; \\\n\
+ write_type dst; \\\n\
+ \\\n\
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\
+ rMean * output_scale - output_zp, output_scale); \\\n\
+ \\\n\
+ half4 paramData_f16; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData0); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\
+ gMean * output_scale - output_zp, output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData1); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\
+ bMean * output_scale - output_zp, output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData2); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_8BITS(U8, vxc_uchar16)\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_8BITS(I8, vxc_char16)\n\
+"; /* end of pre_process_rgb888_planar_1_vx*/
+
+static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\
+\n\
+__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\
+ (\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_array_t input1,\n\
+ __read_only image2d_array_t input2,\n\
+ __write_only image2d_array_t output0,\n\
+ __write_only image2d_array_t output1,\n\
+ __write_only image2d_array_t output2,\n\
+ global int *xRatio,\n\
+ global int *yRatio,\n\
+ global int *xOffset,\n\
+ global int *yOffset,\n\
+ float rMean,\n\
+ float gMean,\n\
+ float bMean,\n\
+ float f32Var\n\
+ )\n\
+{\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
+ int4 coord_out;\n\
+\n\
+ vxc_uchar16 src0, src1, src2, src3;\n\
+ vxc_uchar16 dst0, dst1, dst2;\n\
+\n\
+ VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ coord_out.xy = (coord_in.xy >> 2) * 3;\n\
+ coord_out.zw = coord_in.yy + (int2)(1, 2);\n\
+\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+\n\
+ VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+\n\
+ VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+\n\
+ VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+__kernel void pre_process_rgb888_planar_half_U8toU8\n\
+ (\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_array_t input1,\n\
+ __read_only image2d_array_t input2,\n\
+ __write_only image2d_array_t output0,\n\
+ __write_only image2d_array_t output1,\n\
+ __write_only image2d_array_t output2,\n\
+ global int *xRatio,\n\
+ global int *yRatio,\n\
+ global int *xOffset,\n\
+ global int *yOffset,\n\
+ float rMean,\n\
+ float gMean,\n\
+ float bMean,\n\
+ float f32Var\n\
+ )\n\
+{\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
+\n\
+ vxc_uchar16 src0, src1, src2;\n\
+\n\
+ VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ coord_in.zw = coord_in.xy >> 1;\n\
+\n\
+ VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+"; /* end of pre_process_rgb888_planar_2_vx*/
+
static const char pre_process_rgb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform float outputScale;\n\
@@ -45648,15 +45618,7 @@ UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_sho
UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\
"; /* end of upsamplescale_k2_vx*/
-static const char vsi_nn_kernel_header_vx[] = "/*\n\
- ============================================================================\n\
- Name : libNNExt.vx\n\
- Author : VSI\n\
- Version :\n\
- Copyright : Your copyright notice\n\
- Description :\n\
- ============================================================================\n\
- */\n\
+static const char vsi_nn_kernel_header_vx[] = "\n\
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
\n\
typedef struct Image\n\
@@ -47312,11 +47274,17 @@ CAST_TO_BOOL_FUN_2D(U32, uint4, read_imageui)\n\
static const char clip_BF16_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\
\n\
-__kernel void clip_BF16toBF16(\n\
- __read_only image2d_array_t input,\n\
- __write_only image2d_array_t output,\n\
- float minData,\n\
- float maxData)\n\
+__kernel void clip_BF16toBF16\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
uint4 src0 = read_imageui(input, coord);\n\
@@ -47330,11 +47298,17 @@ __kernel void clip_BF16toBF16(\n\
write_imageui(output, coord, dst);\n\
}\n\
\n\
-__kernel void clip_BF16toBF16_2D(\n\
- __read_only image2d_t input,\n\
- __write_only image2d_t output,\n\
- float minData,\n\
- float maxData)\n\
+__kernel void clip_BF16toBF16_2D\n\
+ (\n\
+ __read_only image2d_t input,\n\
+ __write_only image2d_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
{\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
uint4 src0 = read_imageui(input, coord);\n\
@@ -47349,72 +47323,194 @@ __kernel void clip_BF16toBF16_2D(\n\
}\n\
"; /* end of clip_BF16_cl*/
-static const char clip_F32_cl[] = "__kernel void clip_F32toF32(\n\
- __read_only image2d_array_t input,\n\
- __write_only image2d_array_t output,\n\
- float minData,\n\
- float maxData)\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
- float4 src = read_imagef(input, coord);\n\
- float4 dst = src > minData ? src : minData;\n\
- dst = dst < maxData ? dst : maxData;\n\
- write_imagef(output, coord, dst);\n\
-}\n\
-\n\
-__kernel void clip_F32toF32_2D(\n\
- __read_only image2d_t input,\n\
- __write_only image2d_t output,\n\
- float minData,\n\
- float maxData)\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
- float4 src = read_imagef(input, coord);\n\
- float4 dst = src > minData ? src : minData;\n\
- dst = dst < maxData ? dst : maxData;\n\
- write_imagef(output, coord, dst);\n\
-}\n\
-\n\
-__kernel void clip_F32toU8(\n\
- __read_only image2d_array_t input,\n\
- __write_only image2d_array_t output,\n\
- float minData,\n\
- float maxData,\n\
+static const char clip_F32_cl[] = "__kernel void clip_F32toF32\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float minData,\n\
+ float maxData,\n\
float inputScale,\n\
float inputTail,\n\
float outputScale,\n\
float outputZP\n\
- )\n\
+ )\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
float4 src = read_imagef(input, coord);\n\
- float4 result = src > minData ? src : minData;\n\
- result = result < maxData ? result : maxData;\n\
- uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\
- write_imageui(output, coord, dst);\n\
+ float4 dst = clamp(src, minData, maxData);\n\
+ write_imagef(output, coord, dst);\n\
}\n\
\n\
-__kernel void clip_F32toU8_2D(\n\
- __read_only image2d_t input,\n\
- __write_only image2d_t output,\n\
- float minData,\n\
- float maxData,\n\
+__kernel void clip_F32toF32_2D\n\
+ (\n\
+ __read_only image2d_t input,\n\
+ __write_only image2d_t output,\n\
+ float minData,\n\
+ float maxData,\n\
float inputScale,\n\
float inputTail,\n\
float outputScale,\n\
float outputZP\n\
- )\n\
+ )\n\
{\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
float4 src = read_imagef(input, coord);\n\
- float4 result = src > minData ? src : minData;\n\
- result = result < maxData ? result : maxData;\n\
+ float4 dst = clamp(src, minData, maxData);\n\
+ write_imagef(output, coord, dst);\n\
+}\n\
+\n\
+__kernel void clip_F32toU8\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
+ float4 src = read_imagef(input, coord);\n\
+ float4 result = clamp(src, minData, maxData);\n\
uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\
write_imageui(output, coord, dst);\n\
}\n\
\n\
+__kernel void clip_F32toU8_2D\n\
+ (\n\
+ __read_only image2d_t input,\n\
+ __write_only image2d_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ float4 src = read_imagef(input, coord);\n\
+ float4 result = clamp(src, minData, maxData);\n\
+ uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\
+ write_imageui(output, coord, dst);\n\
+}\n\
+\n\
+__kernel void clip_F32toI32\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
+ float4 src = read_imagef(input, coord);\n\
+ float4 result = clamp(src, minData, maxData);\n\
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);\n\
+ write_imagei(output, coord, dst);\n\
+}\n\
+\n\
+__kernel void clip_F32toI32_2D\n\
+ (\n\
+ __read_only image2d_t input,\n\
+ __write_only image2d_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ float4 src = read_imagef(input, coord);\n\
+ float4 result = clamp(src, minData, maxData);\n\
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);\n\
+ write_imagei(output, coord, dst);\n\
+}\n\
"; /* end of clip_F32_cl*/
+static const char clip_I32_cl[] = "__kernel void clip_I32toI32\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ float4 result = clamp(src, minData, maxData);\n\
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);\n\
+ write_imagei(output, coord, dst);\n\
+}\n\
+\n\
+__kernel void clip_I32toI32_2D\n\
+ (\n\
+ __read_only image2d_t input,\n\
+ __write_only image2d_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ float4 result = clamp(src, minData, maxData);\n\
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);\n\
+ write_imagei(output, coord, dst);\n\
+}\n\
+\n\
+__kernel void clip_I32toF32\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ float4 dst = clamp(src, minData, maxData);\n\
+ write_imagef(output, coord, dst);\n\
+}\n\
+\n\
+__kernel void clip_I32toF32_2D\n\
+ (\n\
+ __read_only image2d_t input,\n\
+ __write_only image2d_t output,\n\
+ float minData,\n\
+ float maxData,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ float4 src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ float4 dst = clamp(src, minData, maxData);\n\
+ write_imagef(output, coord, dst);\n\
+}\n\
+"; /* end of clip_I32_cl*/
+
static const char clip_U8_cl[] = "__kernel void clip_U8toU8(\n\
__read_only image2d_array_t input,\n\
__write_only image2d_array_t output,\n\
@@ -47715,7 +47811,7 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride
} while(0)\n\
"; /* end of eltwise_ops_helper_cl*/
-static const char eltwise_unary_cl[] = "float eltwise_unary_sin(float x, float alpha, float beta)\n\
+static const char eltwise_unary_0_cl[] = "float eltwise_unary_sin(float x, float alpha, float beta)\n\
{\n\
return native_sin(x);\n\
}\n\
@@ -47741,14 +47837,6 @@ float eltwise_unary_log(float x, float alpha, float beta)\n\
return x * rlogE;\n\
}\n\
\n\
-float eltwise_unary_elu(float val, float alpha, float beta)\n\
-{\n\
- float x = val * logE;\n\
- x = exp2(x) * alpha - alpha;\n\
-\n\
- return val < 0 ? x : val;\n\
-}\n\
-\n\
float eltwise_unary_neg(float x, float alpha, float beta)\n\
{\n\
return x * -1;\n\
@@ -47790,34 +47878,45 @@ float eltwise_unary_round(float x, float alpha, float beta)\n\
return convert_float(convert_int_rte(x));\n\
}\n\
\n\
-#define MUL2_RSQRTPI (1.1283791670955126f)\n\
-float erf_eval(float x)\n\
+float evaluate_polynomial_alpha(float x2)\n\
{\n\
- float res = 0;\n\
- float tmp = x;\n\
- float factorial = 1;\n\
- float x_pow = x;\n\
- float one = 1.0f;\n\
- float n = 1;\n\
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,\n\
+ -2.10102402082508e-06f, -5.69250639462346e-05f};\n\
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,\n\
+ -1.60960333262415e-02f, 0};\n\
\n\
- if (x <= -3)\n\
- return -1;\n\
- else if (x >= 3)\n\
- return 1;\n\
+ float poly = alpha0.x * x2 + alpha0.y;\n\
+ poly = poly * x2 + alpha0.z;\n\
+ poly = poly * x2 + alpha0.w;\n\
+ poly = poly * x2 + alpha1.x;\n\
+ poly = poly * x2 + alpha1.y;\n\
+ poly = poly * x2 + alpha1.z;\n\
\n\
- while (fabs(tmp) > 1e-5)\n\
- {\n\
- res += tmp;\n\
-\n\
- factorial *= n;\n\
- one *= -1;\n\
- x_pow *= x * x;\n\
- tmp = one / factorial * x_pow / ( 2 * n + 1);\n\
-\n\
- n += 1.0f;\n\
- }\n\
- return res * MUL2_RSQRTPI;\n\
+ return poly;\n\
}\n\
+\n\
+float evaluate_polynomial_beta(float x2)\n\
+{\n\
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,\n\
+ -1.68282697438203e-03f, -7.37332916720468e-03f};\n\
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};\n\
+\n\
+ float poly = beta0.x * x2 + beta0.y;\n\
+ poly = poly * x2 + beta0.z;\n\
+ poly = poly * x2 + beta0.w;\n\
+ poly = poly * x2 + beta1.x;\n\
+\n\
+ return 1.0f / poly;\n\
+}\n\
+\n\
+float erf_eval(float _x)\n\
+{\n\
+ float x = clamp(_x, -4, 4);\n\
+ float x2 = x * x;\n\
+\n\
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);\n\
+}\n\
+\n\
#define RSQRT2 (0.70710678118654752440084436210485f)\n\
float eltwise_unary_gelu(float x, float alpha, float beta)\n\
{\n\
@@ -47834,39 +47933,21 @@ float eltwise_unary_hard_gelu(float x, float alpha, float beta)\n\
return x * cdf;\n\
}\n\
\n\
-#define ELTWISE_UNARY_F32(func_name) \\\n\
-__kernel void func_name##_F32toF32 \\\n\
- ( \\\n\
- __read_only image2d_array_t input, \\\n\
- __write_only image2d_array_t output, \\\n\
- float inputScale, \\\n\
- float inputTail, \\\n\
- float outputScale, \\\n\
- float outputZP, \\\n\
- float alpha, \\\n\
- float beta \\\n\
- ) \\\n\
-{ \\\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
- \\\n\
- float4 src = read_imagef(input, coord); \\\n\
- \\\n\
- float4 dst = 0; \\\n\
- dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \\\n\
- \\\n\
- write_imagef(output, coord, dst.xxxx); \\\n\
+float eltwise_unary_selu(float val, float alpha_times_gamma, float gamma)\n\
+{\n\
+ float x = val * logE;\n\
+ x = exp2(x) * alpha_times_gamma - alpha_times_gamma;\n\
+\n\
+ return val <= 0 ? x : val * gamma;\n\
+}\n\
+\n\
+float eltwise_unary_celu(float val, float alpha, float rcp_alpha)\n\
+{\n\
+ float x = val * logE * rcp_alpha;\n\
+ x = exp2(x) * alpha - alpha;\n\
+\n\
+ return val < 0 ? x : val;\n\
}\n\
-ELTWISE_UNARY_F32(sin)\n\
-ELTWISE_UNARY_F32(cos)\n\
-ELTWISE_UNARY_F32(exp)\n\
-ELTWISE_UNARY_F32(log)\n\
-ELTWISE_UNARY_F32(elu)\n\
-ELTWISE_UNARY_F32(neg)\n\
-ELTWISE_UNARY_F32(mish)\n\
-ELTWISE_UNARY_F32(hard_sigmoid)\n\
-ELTWISE_UNARY_F32(round)\n\
-ELTWISE_UNARY_F32(gelu)\n\
-ELTWISE_UNARY_F32(hard_gelu)\n\
\n\
#define ELTWISE_UNARY_F32_2D(func_name) \\\n\
__kernel void func_name##_F32toF32_2D \\\n\
@@ -47894,48 +47975,14 @@ ELTWISE_UNARY_F32_2D(sin)\n\
ELTWISE_UNARY_F32_2D(cos)\n\
ELTWISE_UNARY_F32_2D(exp)\n\
ELTWISE_UNARY_F32_2D(log)\n\
-ELTWISE_UNARY_F32_2D(elu)\n\
ELTWISE_UNARY_F32_2D(neg)\n\
ELTWISE_UNARY_F32_2D(mish)\n\
ELTWISE_UNARY_F32_2D(hard_sigmoid)\n\
ELTWISE_UNARY_F32_2D(round)\n\
ELTWISE_UNARY_F32_2D(gelu)\n\
ELTWISE_UNARY_F32_2D(hard_gelu)\n\
-\n\
-#define ELTWISE_UNARY_U8(func_name) \\\n\
-__kernel void func_name##_U8toU8 \\\n\
- ( \\\n\
- __read_only image2d_array_t input, \\\n\
- __write_only image2d_array_t output, \\\n\
- float inputScale, \\\n\
- float inputTail, \\\n\
- float outputScale, \\\n\
- float outputZP, \\\n\
- float alpha, \\\n\
- float beta \\\n\
- ) \\\n\
-{ \\\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
- \\\n\
- uint4 src = read_imageui(input, coord); \\\n\
- float4 data = convert_float4(src) * inputScale - inputTail; \\\n\
- \\\n\
- data.x = eltwise_unary_##func_name(data.x, alpha, beta); \\\n\
- uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\
- \\\n\
- write_imageui(output, coord, dst); \\\n\
-}\n\
-ELTWISE_UNARY_U8(sin)\n\
-ELTWISE_UNARY_U8(cos)\n\
-ELTWISE_UNARY_U8(exp)\n\
-ELTWISE_UNARY_U8(log)\n\
-ELTWISE_UNARY_U8(elu)\n\
-ELTWISE_UNARY_U8(neg)\n\
-ELTWISE_UNARY_U8(mish)\n\
-ELTWISE_UNARY_U8(hard_sigmoid)\n\
-ELTWISE_UNARY_U8(round)\n\
-ELTWISE_UNARY_U8(gelu)\n\
-ELTWISE_UNARY_U8(hard_gelu)\n\
+ELTWISE_UNARY_F32_2D(selu)\n\
+ELTWISE_UNARY_F32_2D(celu)\n\
\n\
#define ELTWISE_UNARY_U8_2D(func_name) \\\n\
__kernel void func_name##_U8toU8_2D \\\n\
@@ -47964,33 +48011,14 @@ ELTWISE_UNARY_U8_2D(sin)\n\
ELTWISE_UNARY_U8_2D(cos)\n\
ELTWISE_UNARY_U8_2D(exp)\n\
ELTWISE_UNARY_U8_2D(log)\n\
-ELTWISE_UNARY_U8_2D(elu)\n\
ELTWISE_UNARY_U8_2D(neg)\n\
ELTWISE_UNARY_U8_2D(mish)\n\
ELTWISE_UNARY_U8_2D(hard_sigmoid)\n\
ELTWISE_UNARY_U8_2D(round)\n\
ELTWISE_UNARY_U8_2D(gelu)\n\
ELTWISE_UNARY_U8_2D(hard_gelu)\n\
-\n\
-__kernel void neg_I32toI32\n\
- (\n\
- __read_only image2d_array_t input,\n\
- __write_only image2d_array_t output,\n\
- float inputScale,\n\
- float inputTail,\n\
- float outputScale,\n\
- float outputZP,\n\
- float alpha,\n\
- float beta\n\
- )\n\
-{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
- int4 src = read_imagei(input, coord);\n\
-\n\
- int4 dst = -src;\n\
-\n\
- write_imagei(output, coord, dst);\n\
-}\n\
+ELTWISE_UNARY_U8_2D(selu)\n\
+ELTWISE_UNARY_U8_2D(celu)\n\
\n\
__kernel void neg_I32toI32_2D\n\
(\n\
@@ -48011,31 +48039,275 @@ __kernel void neg_I32toI32_2D\n\
\n\
write_imagei(output, coord, dst);\n\
}\n\
-"; /* end of eltwise_unary_cl*/
+"; /* end of eltwise_unary_0_cl*/
-static const char erf_cl[] = "#define MUL2_RSQRTPI (1.1283791670955126f)\n\
+static const char eltwise_unary_1_cl[] = "float eltwise_unary_sin(float x, float alpha, float beta)\n\
+{\n\
+ return native_sin(x);\n\
+}\n\
+\n\
+float eltwise_unary_cos(float x, float alpha, float beta)\n\
+{\n\
+ return native_cos(x);\n\
+}\n\
+\n\
+#define logE (1.44269502f)\n\
+#define twoLogE (logE * 2.0f)\n\
+float eltwise_unary_exp(float x, float alpha, float beta)\n\
+{\n\
+ x *= logE;\n\
+ x = exp2(x);\n\
+ return x;\n\
+}\n\
+\n\
+#define rlogE (0.693147182f)\n\
+float eltwise_unary_log(float x, float alpha, float beta)\n\
+{\n\
+ x = log2(x);\n\
+ return x * rlogE;\n\
+}\n\
+\n\
+float eltwise_unary_neg(float x, float alpha, float beta)\n\
+{\n\
+ return x * -1;\n\
+}\n\
+\n\
+float eltwise_unary_hard_sigmoid(float x, float alpha, float beta)\n\
+{\n\
+ x = alpha * x + beta;\n\
+ x = clamp(x, 0, 1);\n\
+ return x;\n\
+}\n\
+\n\
+float _softrelu(float x, float alpha)\n\
+{\n\
+ x *= logE;\n\
+ x = exp2(x);\n\
+ x += 1;\n\
+ x = log2(x);\n\
+ return x * rlogE;\n\
+}\n\
+\n\
+float _tanh(float x, float alpha)\n\
+{\n\
+ x *= -twoLogE;\n\
+ x = 1 + exp2(x);\n\
+ x = 1 / x;\n\
+ return (2 * x - 1);\n\
+}\n\
+\n\
+float eltwise_unary_mish(float x, float alpha, float beta)\n\
+{\n\
+ float y = _softrelu(x, alpha);\n\
+ x = x * _tanh(y, alpha);\n\
+ return x;\n\
+}\n\
+\n\
+float eltwise_unary_round(float x, float alpha, float beta)\n\
+{\n\
+ return convert_float(convert_int_rte(x));\n\
+}\n\
+\n\
+float evaluate_polynomial_alpha(float x2)\n\
+{\n\
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,\n\
+ -2.10102402082508e-06f, -5.69250639462346e-05f};\n\
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,\n\
+ -1.60960333262415e-02f, 0};\n\
+\n\
+ float poly = alpha0.x * x2 + alpha0.y;\n\
+ poly = poly * x2 + alpha0.z;\n\
+ poly = poly * x2 + alpha0.w;\n\
+ poly = poly * x2 + alpha1.x;\n\
+ poly = poly * x2 + alpha1.y;\n\
+ poly = poly * x2 + alpha1.z;\n\
+\n\
+ return poly;\n\
+}\n\
+\n\
+float evaluate_polynomial_beta(float x2)\n\
+{\n\
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,\n\
+ -1.68282697438203e-03f, -7.37332916720468e-03f};\n\
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};\n\
+\n\
+ float poly = beta0.x * x2 + beta0.y;\n\
+ poly = poly * x2 + beta0.z;\n\
+ poly = poly * x2 + beta0.w;\n\
+ poly = poly * x2 + beta1.x;\n\
+\n\
+ return 1.0f / poly;\n\
+}\n\
+\n\
+float erf_eval(float _x)\n\
+{\n\
+ float x = clamp(_x, -4, 4);\n\
+ float x2 = x * x;\n\
+\n\
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);\n\
+}\n\
+\n\
+#define RSQRT2 (0.70710678118654752440084436210485f)\n\
+float eltwise_unary_gelu(float x, float alpha, float beta)\n\
+{\n\
+ x = 0.5f * x * (1 + erf_eval(x * RSQRT2));\n\
+\n\
+ return x;\n\
+}\n\
+\n\
+#define SQRT_2_RCP_PI 0.7978845834732056f\n\
+float eltwise_unary_hard_gelu(float x, float alpha, float beta)\n\
+{\n\
+ float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *\n\
+ (x + 0.044715f * x * x * x), 0);\n\
+ return x * cdf;\n\
+}\n\
+\n\
+float eltwise_unary_selu(float val, float alpha_times_gamma, float gamma)\n\
+{\n\
+ float x = val * logE;\n\
+ x = exp2(x) * alpha_times_gamma - alpha_times_gamma;\n\
+\n\
+ return val < 0 ? x : val * gamma;\n\
+}\n\
+\n\
+float eltwise_unary_celu(float val, float alpha, float rcp_alpha)\n\
+{\n\
+ float x = val * logE * rcp_alpha;\n\
+ x = exp2(x) * alpha - alpha;\n\
+\n\
+ return val < 0 ? x : val;\n\
+}\n\
+\n\
+#define ELTWISE_UNARY_F32(func_name) \\\n\
+__kernel void func_name##_F32toF32 \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float inputScale, \\\n\
+ float inputTail, \\\n\
+ float outputScale, \\\n\
+ float outputZP, \\\n\
+ float alpha, \\\n\
+ float beta \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
+ \\\n\
+ float4 src = read_imagef(input, coord); \\\n\
+ \\\n\
+ float4 dst = 0; \\\n\
+ dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \\\n\
+ \\\n\
+ write_imagef(output, coord, dst.xxxx); \\\n\
+}\n\
+ELTWISE_UNARY_F32(sin)\n\
+ELTWISE_UNARY_F32(cos)\n\
+ELTWISE_UNARY_F32(exp)\n\
+ELTWISE_UNARY_F32(log)\n\
+ELTWISE_UNARY_F32(neg)\n\
+ELTWISE_UNARY_F32(mish)\n\
+ELTWISE_UNARY_F32(hard_sigmoid)\n\
+ELTWISE_UNARY_F32(round)\n\
+ELTWISE_UNARY_F32(gelu)\n\
+ELTWISE_UNARY_F32(hard_gelu)\n\
+ELTWISE_UNARY_F32(selu)\n\
+ELTWISE_UNARY_F32(celu)\n\
+\n\
+#define ELTWISE_UNARY_U8(func_name) \\\n\
+__kernel void func_name##_U8toU8 \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float inputScale, \\\n\
+ float inputTail, \\\n\
+ float outputScale, \\\n\
+ float outputZP, \\\n\
+ float alpha, \\\n\
+ float beta \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
+ \\\n\
+ uint4 src = read_imageui(input, coord); \\\n\
+ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\
+ \\\n\
+ data.x = eltwise_unary_##func_name(data.x, alpha, beta); \\\n\
+ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\
+ \\\n\
+ write_imageui(output, coord, dst); \\\n\
+}\n\
+ELTWISE_UNARY_U8(sin)\n\
+ELTWISE_UNARY_U8(cos)\n\
+ELTWISE_UNARY_U8(exp)\n\
+ELTWISE_UNARY_U8(log)\n\
+ELTWISE_UNARY_U8(neg)\n\
+ELTWISE_UNARY_U8(mish)\n\
+ELTWISE_UNARY_U8(hard_sigmoid)\n\
+ELTWISE_UNARY_U8(round)\n\
+ELTWISE_UNARY_U8(gelu)\n\
+ELTWISE_UNARY_U8(hard_gelu)\n\
+ELTWISE_UNARY_U8(selu)\n\
+ELTWISE_UNARY_U8(celu)\n\
+\n\
+__kernel void neg_I32toI32\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP,\n\
+ float alpha,\n\
+ float beta\n\
+ )\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
+ int4 src = read_imagei(input, coord);\n\
+\n\
+ int4 dst = -src;\n\
+\n\
+ write_imagei(output, coord, dst);\n\
+}\n\
+"; /* end of eltwise_unary_1_cl*/
+
+static const char erf_cl[] = "float evaluate_polynomial_alpha(float x2)\n\
+{\n\
+ float4 alpha0 = (float4){-2.72614225801306e-10f, 2.77068142495902e-08f,\n\
+ -2.10102402082508e-06f, -5.69250639462346e-05f};\n\
+ float4 alpha1 = (float4){-7.34990630326855e-04f, -2.95459980854025e-03f,\n\
+ -1.60960333262415e-02f, 0};\n\
+\n\
+ float poly = alpha0.x * x2 + alpha0.y;\n\
+ poly = poly * x2 + alpha0.z;\n\
+ poly = poly * x2 + alpha0.w;\n\
+ poly = poly * x2 + alpha1.x;\n\
+ poly = poly * x2 + alpha1.y;\n\
+ poly = poly * x2 + alpha1.z;\n\
+\n\
+ return poly;\n\
+}\n\
+\n\
+float evaluate_polynomial_beta(float x2)\n\
+{\n\
+ float4 beta0 = (float4){-1.45660718464996e-05f, -2.13374055278905e-04f,\n\
+ -1.68282697438203e-03f, -7.37332916720468e-03f};\n\
+ float4 beta1 = (float4){-1.42647390514189e-02f, 0, 0, 0};\n\
+\n\
+ float poly = beta0.x * x2 + beta0.y;\n\
+ poly = poly * x2 + beta0.z;\n\
+ poly = poly * x2 + beta0.w;\n\
+ poly = poly * x2 + beta1.x;\n\
+\n\
+ return 1.0f / poly;\n\
+}\n\
+\n\
float eltwise_unary_erf(float _x)\n\
{\n\
- float x = clamp(_x, -2, 2);\n\
- float res = 0;\n\
- float tmp = x;\n\
- float factorial = 1;\n\
- float x_pow = x;\n\
- float one = 1.0f;\n\
- float n = 1;\n\
+ float x = clamp(_x, -4, 4);\n\
+ float x2 = x * x;\n\
\n\
- while (fabs(tmp) > 1e-5)\n\
- {\n\
- res += tmp;\n\
-\n\
- factorial *= n;\n\
- one *= -1;\n\
- x_pow *= x * x;\n\
- tmp = one / factorial * x_pow / ( 2 * n + 1);\n\
-\n\
- n += 1.0f;\n\
- }\n\
- return res * MUL2_RSQRTPI;\n\
+ return x * evaluate_polynomial_alpha(x2) * evaluate_polynomial_beta(x2);\n\
}\n\
\n\
#define ELTWISE_UNARY_F32(func_name) \\\n\
@@ -48593,6 +48865,143 @@ __kernel void gather_batch_F32toF32(\n\
}\n\
"; /* end of gather_batch_cl*/
+static const char gather_elements_cl[] = "\n\
+#define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \\\n\
+__kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type data = read_func(input0, (int2)(index, coord.y)); \\\n\
+ float4 dst = convert_float4(data) * input_scale + input_tail; \\\n\
+ data = conv_func(dst); \\\n\
+ \\\n\
+ write_func(output, coord, data); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS0_2D(F32, float4, read_imagef, write_imagef, convert_float4)\n\
+GATHER_ELEMENTS_AXIS0_2D(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\
+GATHER_ELEMENTS_AXIS0_2D(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS0(name, data_type, read_func, write_func, conv_func) \\\n\
+__kernel void gather_elements_axis0_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type data = read_func(input0, (int4)(index, coord.yzz)); \\\n\
+ float4 dst = convert_float4(data) * input_scale + input_tail; \\\n\
+ data = conv_func(dst); \\\n\
+ \\\n\
+ write_func(output, coord, data); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS0(F32, float4, read_imagef, write_imagef, convert_float4)\n\
+GATHER_ELEMENTS_AXIS0(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\
+GATHER_ELEMENTS_AXIS0(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS1_2D(name, data_type, read_func, write_func, conv_func) \\\n\
+__kernel void gather_elements_axis1_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type data = read_func(input0, (int2)(coord.x, index)); \\\n\
+ float4 dst = convert_float4(data) * input_scale + input_tail; \\\n\
+ data = conv_func(dst); \\\n\
+ \\\n\
+ write_func(output, coord, data); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS1_2D(F32, float4, read_imagef, write_imagef, convert_float4)\n\
+GATHER_ELEMENTS_AXIS1_2D(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\
+GATHER_ELEMENTS_AXIS1_2D(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS1(name, data_type, read_func, write_func, conv_func) \\\n\
+__kernel void gather_elements_axis1_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type data = read_func(input0, (int4)(coord.x, index, coord.zz)); \\\n\
+ float4 dst = convert_float4(data) * input_scale + input_tail; \\\n\
+ data = conv_func(dst); \\\n\
+ \\\n\
+ write_func(output, coord, data); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS1(F32, float4, read_imagef, write_imagef, convert_float4)\n\
+GATHER_ELEMENTS_AXIS1(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\
+GATHER_ELEMENTS_AXIS1(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\
+\n\
+#define GATHER_ELEMENTS_AXIS2(name, data_type, read_func, write_func, conv_func) \\\n\
+__kernel void gather_elements_axis2_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ \\\n\
+ int index = read_imagei(input1, coord).x; \\\n\
+ int index1 = index + axis_size; \\\n\
+ index = index < 0 ? index1 : index; \\\n\
+ \\\n\
+ data_type data = read_func(input0, (int4)(coord.xy, index, coord.z)); \\\n\
+ float4 dst = convert_float4(data) * input_scale + input_tail; \\\n\
+ data = conv_func(dst); \\\n\
+ \\\n\
+ write_func(output, coord, data); \\\n\
+}\n\
+GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4)\n\
+GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\
+GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\
+"; /* end of gather_elements_cl*/
+
static const char gather_nd_cl[] = "__kernel void gather_nd_U8toU8_1D(\n\
__read_only image2d_t input0,\n\
__read_only image2d_t input1,\n\
@@ -51332,7 +51741,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
__write_only image2d_t output,\n\
int axis,\n\
int axis_size,\n\
- float rsEps\n\
+ float rsEps,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
)\n\
{\n\
int lidx = get_local_id(0);\n\
@@ -51355,7 +51768,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\
sum = dot(data0, one);\n\
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\
- for(coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\
+ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\
{\n\
src = read_imagef(input, coord);\n\
scale_value = read_imagef(scale, coord_scale);\n\
@@ -51397,7 +51810,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\
sum = dot(data0, one);\n\
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\
- for(coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\
+ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\
{\n\
src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\
scale_value = read_imagef(scale, coord_scale);\n\
@@ -51407,7 +51820,48 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a
}\n\
}\n\
\n\
-\n\
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_axis0_I32_F32toI32_2D(\n\
+ __read_only image2d_t input,\n\
+ __read_only image2d_t scale,\n\
+ __write_only image2d_t output,\n\
+ int axis,\n\
+ int axis_size,\n\
+ float rsEps,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int lidx = get_local_id(0);\n\
+ int gidx = get_global_id(0);\n\
+ float4 src, scale_value, result;\n\
+ float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;\n\
+ int2 coord = (int2)(gidx, get_global_id(1));\n\
+ int2 coord_scale = (int2)(gidx, 0);\n\
+ __local float lcl_sum[16];\n\
+ for(; coord.x < axis_size; coord.x += 16)\n\
+ {\n\
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ pSum += (src.x * src.x);\n\
+ }\n\
+ lcl_sum[lidx] = pSum;\n\
+ barrier(CLK_LOCAL_MEM_FENCE);\n\
+ float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\
+ float4 one = (float4)(1, 1, 1, 1);\n\
+ float4 data0;\n\
+ data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\
+ sum = dot(data0, one);\n\
+ rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\
+ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\
+ {\n\
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ scale_value = read_imagef(scale, coord_scale);\n\
+ result = src * rsqrt_sum * scale_value;\n\
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);\n\
+ write_imagei(output, coord, dst);\n\
+ }\n\
+}\n\
"; /* end of l2normalizescale_axis0_cl*/
static const char l2normalizescale_axis1_cl[] = "\n\
@@ -51417,7 +51871,11 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
__write_only image2d_t output,\n\
int axis,\n\
int axis_size,\n\
- float rsEps\n\
+ float rsEps,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
)\n\
{\n\
int lidx = get_local_id(1);\n\
@@ -51440,7 +51898,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\
sum = dot(data0, one);\n\
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\
- for(coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\
+ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\
{\n\
src = read_imagef(input, coord);\n\
scale_value = read_imagef(scale, coord_scale);\n\
@@ -51482,7 +51940,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\
sum = dot(data0, one);\n\
rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\
- for(coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\
+ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\
{\n\
src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\
scale_value = read_imagef(scale, coord_scale);\n\
@@ -51491,7 +51949,49 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a
write_imageui(output, coord, dst);\n\
}\n\
}\n\
-"; /* end of l2normalizescale_axis1_cl*/
+\n\
+__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_axis1_I32_F32toI32_2D(\n\
+ __read_only image2d_t input,\n\
+ __read_only image2d_t scale,\n\
+ __write_only image2d_t output,\n\
+ int axis,\n\
+ int axis_size,\n\
+ float rsEps,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputZP\n\
+ )\n\
+{\n\
+ int lidx = get_local_id(1);\n\
+ int gidy = get_global_id(1);\n\
+ float4 src, scale_value, result;\n\
+ float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;\n\
+ int2 coord = (int2)(get_global_id(0), gidy );\n\
+ int2 coord_scale = (int2)(gidy, 0);\n\
+ __local float lcl_sum[16];\n\
+ for (; coord.y < axis_size; coord.y += 16)\n\
+ {\n\
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ pSum = pSum + src.x * src.x;\n\
+ }\n\
+ lcl_sum[lidx] = pSum;\n\
+ barrier(CLK_LOCAL_MEM_FENCE);\n\
+ float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\
+ float4 one = (float4)(1, 1, 1, 1);\n\
+ float4 data0;\n\
+ data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\
+ sum = dot(data0, one);\n\
+ rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\
+ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\
+ {\n\
+ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\
+ scale_value = read_imagef(scale, coord_scale);\n\
+ result = src * rsqrt_sum * scale_value;\n\
+ int4 dst = convert_int4_rte(result * outputScale + outputZP);\n\
+ write_imagei(output, coord, dst);\n\
+ }\n\
+}"; /* end of l2normalizescale_axis1_cl*/
static const char layer_normalization_cl[] = "\n\
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_F32toF32(\n\
@@ -55597,7 +56097,10 @@ __kernel void maximum_I32I32toI32\n\
READ_IMAGEI_2DARRAY(src0, input0, coord);\n\
READ_IMAGEI_2DARRAY(src1, input1, coord);\n\
\n\
- int4 dst = src0 > src1 ? src0 : src1;\n\
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\
+ float4 data = data0 > data1 ? data0 : data1;\n\
+ int4 dst = convert_int4(data * outputScale + outputZP);\n\
\n\
write_imagei(output, coord, dst);\n\
}\n\
@@ -55620,11 +56123,13 @@ __kernel void maximum_I32I32toI32_2D\n\
int4 src0 = read_imagei(input0, coord);\n\
int4 src1 = read_imagei(input1, coord);\n\
\n\
- int4 dst = src0 > src1 ? src0 : src1;\n\
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\
+ float4 data = data0 > data1 ? data0 : data1;\n\
+ int4 dst = convert_int4(data * outputScale + outputZP);\n\
\n\
write_imagei(output, coord, dst);\n\
}\n\
-\n\
"; /* end of maximum_cl*/
static const char minimum_cl[] = "__kernel void minimum_FP32FP32toFP32\n\
@@ -55750,7 +56255,10 @@ __kernel void minimum_I32I32toI32\n\
READ_IMAGEI_2DARRAY(src0, input0, coord);\n\
READ_IMAGEI_2DARRAY(src1, input1, coord);\n\
\n\
- int4 dst = src0 < src1 ? src0 : src1;\n\
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\
+ float4 data = data0 < data1 ? data0 : data1;\n\
+ int4 dst = convert_int4(data * outputScale + outputZP);\n\
\n\
write_imagei(output, coord, dst);\n\
}\n\
@@ -55773,11 +56281,13 @@ __kernel void minimum_I32I32toI32_2D\n\
int4 src0 = read_imagei(input0, coord);\n\
int4 src1 = read_imagei(input1, coord);\n\
\n\
- int4 dst = src0 < src1 ? src0 : src1;\n\
+ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\
+ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\
+ float4 data = data0 < data1 ? data0 : data1;\n\
+ int4 dst = convert_int4(data * outputScale + outputZP);\n\
\n\
write_imagei(output, coord, dst);\n\
}\n\
-\n\
"; /* end of minimum_cl*/
static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF32(\n\
@@ -55808,8 +56318,8 @@ static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF32(\n\
{\n\
data = read_imageui(input, coord0).x;\n\
coord0.x++;\n\
- tmpSum += (data);\n\
- tmpSqr += (data * data);\n\
+ tmpSum = tmpSum + data;\n\
+ tmpSqr = tmpSqr + data * data;\n\
}\n\
sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp)) * e2InScale;\n\
sum = convert_float(as_int(tmpSum - width * input_zp)) * input_scale;\n\
@@ -55882,7 +56392,7 @@ __kernel void moments_axis0_I32toF32(\n\
\n\
for(coord0.x = 0; coord0.x < width;)\n\
{\n\
- data = convert_float(read_imagei(input, coord0).x);\n\
+ data = convert_float(read_imagei(input, coord0).x - input_zp);\n\
coord0.x++;\n\
\n\
sum = sum + data;\n\
@@ -55966,8 +56476,8 @@ static const char moments_axis01_cl[] = "__kernel void moments_axis01_U8toF32(\n
{\n\
data = read_imageui(input, coord);\n\
coord.y++;\n\
- tmpSum += data.x;\n\
- tmpSqr += data.x * data.x;\n\
+ tmpSum = tmpSum + data.x;\n\
+ tmpSqr = tmpSqr + data.x * data.x;\n\
}\n\
sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\
sum += (tmpSum - height * input_zp) * input_scale;\n\
@@ -56203,8 +56713,8 @@ static const char moments_axis012_cl[] = "__kernel void moments_axis012_U8toF32(
{\n\
data = read_imageui(input, coord);\n\
coord.y++;\n\
- tmpSum += data.x;\n\
- tmpSqr += data.x * data.x;\n\
+ tmpSum = tmpSum + data.x;\n\
+ tmpSqr = tmpSqr + data.x * data.x;\n\
}\n\
sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\
sum += (tmpSum - height * input_zp) * input_scale;\n\
@@ -56441,8 +56951,8 @@ static const char moments_axis1_cl[] = "__kernel void moments_axis1_U8toF32(\n\
{\n\
data = read_imageui(input, coord0).x;\n\
coord0.y++;\n\
- tmpSum += (data);\n\
- tmpSqr += (data * data);\n\
+ tmpSum = tmpSum + data;\n\
+ tmpSqr = tmpSqr + data * data;\n\
}\n\
sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp)) * e2InScale;\n\
sum = convert_float(as_int(tmpSum - height * input_zp)) * input_scale;\n\
@@ -56516,7 +57026,7 @@ __kernel void moments_axis1_I32toF32(\n\
\n\
for(coord0.y = 0; coord0.y < height;)\n\
{\n\
- data = convert_float(read_imagei(input, coord0).x);\n\
+ data = convert_float(read_imagei(input, coord0).x - input_zp);\n\
coord0.y++;\n\
sum = sum + data;\n\
sqr = sqr + data * data;\n\
@@ -56601,8 +57111,8 @@ static const char moments_axis2_cl[] = "__kernel void moments_axis2_U8toF32(\n\
{\n\
data = read_imageui(input, coord0).x;\n\
coord0.z++;\n\
- tmpSum += (data);\n\
- tmpSqr += (data * data);\n\
+ tmpSum = tmpSum + data;\n\
+ tmpSqr = tmpSqr + data * data;\n\
}\n\
sqr = as_int(tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale;\n\
sum = tmpSum * input_scale;\n\
@@ -56682,7 +57192,7 @@ __kernel void moments_axis2_I32toF32(\n\
\n\
for(coord0.z = 0; coord0.z < chn;)\n\
{\n\
- data = convert_float(read_imagei(input, coord0).x);\n\
+ data = convert_float(read_imagei(input, coord0).x - input_zp);\n\
coord0.z++;\n\
\n\
\n\
@@ -56782,12 +57292,13 @@ __kernel void one_hot_I32toI32\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
\n\
- int4 val = read_imagei(input, coord.xy);\n\
+ int4 src = read_imagei(input, coord.xy);\n\
\n\
+ int val = convert_int(convert_float(src.x) * inputScale - inputTail);\n\
do\n\
{\n\
int4 dst;\n\
- dst.x = val.x == coord.z ? on_value : off_value;\n\
+ dst.x = val == coord.z ? on_value : off_value;\n\
\n\
write_imagei(output, coord.xzyw, dst.xxxx);\n\
\n\
@@ -56808,11 +57319,13 @@ __kernel void one_hot_I32toU8\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
\n\
- int4 val = read_imagei(input, coord.xy);\n\
+ int4 src = read_imagei(input, coord.xy);\n\
+\n\
+ int val = convert_int(convert_float(src.x) * inputScale - inputTail);\n\
do\n\
{\n\
uint4 dst;\n\
- dst.x = val.x == coord.z ? on_value : off_value;\n\
+ dst.x = val == coord.z ? on_value : off_value;\n\
\n\
write_imageui(output, coord.xzyw, dst.xxxx);\n\
\n\
@@ -56833,12 +57346,13 @@ __kernel void one_hot_I32toF32\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
\n\
- int4 val = read_imagei(input, coord.xy);\n\
+ int4 src = read_imagei(input, coord.xy);\n\
\n\
+ int val = convert_int(convert_float(src.x) * inputScale - inputTail);\n\
do\n\
{\n\
float4 dst;\n\
- dst.x = val.x == coord.z ? on_value : off_value;\n\
+ dst.x = val == coord.z ? on_value : off_value;\n\
\n\
write_imagef(output, coord.xzyw, dst.xxxx);\n\
\n\
@@ -61199,14 +61713,17 @@ static const source_map_t evis_resource[] =
{"depthwise_conv1d_src2_vx", depthwise_conv1d_src2_vx},
{"depthwise_conv1d_src3_vx", depthwise_conv1d_src3_vx},
{"detect_post_box_vx", detect_post_box_vx},
- {"eltwise_unary_2d_vx", eltwise_unary_2d_vx},
- {"eltwise_unary_3d_vx", eltwise_unary_3d_vx},
+ {"eltwise_unary_2d_0_vx", eltwise_unary_2d_0_vx},
+ {"eltwise_unary_2d_1_vx", eltwise_unary_2d_1_vx},
+ {"eltwise_unary_3d_0_vx", eltwise_unary_3d_0_vx},
+ {"eltwise_unary_3d_1_vx", eltwise_unary_3d_1_vx},
{"erf_vx", erf_vx},
{"extra_ending_vx", extra_ending_vx},
{"floordiv_vx", floordiv_vx},
{"gather_vx", gather_vx},
{"gather_array_vx", gather_array_vx},
{"gather_batch_vx", gather_batch_vx},
+ {"gather_elements_vx", gather_elements_vx},
{"gather_mix_vx", gather_mix_vx},
{"gather_mix_batch_vx", gather_mix_batch_vx},
{"gather_nd_vx", gather_nd_vx},
@@ -61294,12 +61811,10 @@ static const source_map_t evis_resource[] =
{"matrixmul_u8f16_f16_vx", matrixmul_u8f16_f16_vx},
{"matrixmul_u8f16_u8_vx", matrixmul_u8f16_u8_vx},
{"matrixmul_u8u8_f16_vx", matrixmul_u8u8_f16_vx},
- {"maximum_vx", maximum_vx},
- {"maximum_fp16_vx", maximum_fp16_vx},
- {"maximum_i16_vx", maximum_i16_vx},
- {"minimum_vx", minimum_vx},
- {"minimum_fp16_vx", minimum_fp16_vx},
- {"minimum_i16_vx", minimum_i16_vx},
+ {"maximum_0_vx", maximum_0_vx},
+ {"maximum_1_vx", maximum_1_vx},
+ {"minimum_0_vx", minimum_0_vx},
+ {"minimum_1_vx", minimum_1_vx},
{"moments_axis0_vx", moments_axis0_vx},
{"moments_axis01_vx", moments_axis01_vx},
{"moments_axis012_vx", moments_axis012_vx},
@@ -61326,6 +61841,9 @@ static const source_map_t evis_resource[] =
{"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx},
{"pre_process_nv12_scale_mix_vx", pre_process_nv12_scale_mix_vx},
{"pre_process_rgb_vx", pre_process_rgb_vx},
+ {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx},
+ {"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx},
+ {"pre_process_rgb888_planar_2_vx", pre_process_rgb888_planar_2_vx},
{"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx},
{"pre_process_yuv420_copy_u8_vx", pre_process_yuv420_copy_u8_vx},
{"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx},
@@ -61414,15 +61932,18 @@ static const source_map_t cl_resource[] =
{"cast_cl", cast_cl},
{"clip_BF16_cl", clip_BF16_cl},
{"clip_F32_cl", clip_F32_cl},
+ {"clip_I32_cl", clip_I32_cl},
{"clip_U8_cl", clip_U8_cl},
{"depth2space_crd_cl", depth2space_crd_cl},
{"detect_post_box_cl", detect_post_box_cl},
{"eltwise_ops_helper_cl", eltwise_ops_helper_cl},
- {"eltwise_unary_cl", eltwise_unary_cl},
+ {"eltwise_unary_0_cl", eltwise_unary_0_cl},
+ {"eltwise_unary_1_cl", eltwise_unary_1_cl},
{"erf_cl", erf_cl},
{"floordiv_cl", floordiv_cl},
{"gather_cl", gather_cl},
{"gather_batch_cl", gather_batch_cl},
+ {"gather_elements_cl", gather_elements_cl},
{"gather_nd_cl", gather_nd_cl},
{"gather_nd_3d_cl", gather_nd_3d_cl},
{"group_normalization_f32_cl", group_normalization_f32_cl},
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c
index 8a54e35..0b76c95 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c
@@ -206,24 +206,34 @@ static vsi_bool op_check
{
/* check inputs outputs data type */
BEGIN_IO_TYPE_DECL(ARGMIN, 1, 1)
- IO_TYPE(D_F16, D_U8)
- IO_TYPE(D_F16, D_I16)
- IO_TYPE(D_BF16, D_U8)
- IO_TYPE(D_BF16, D_I16)
- IO_TYPE(D_I8|Q_DFP, D_U8)
- IO_TYPE(D_I8|Q_DFP, D_I16)
- IO_TYPE(D_U8|Q_ASYM, D_U8)
- IO_TYPE(D_U8|Q_ASYM, D_I16)
- IO_TYPE(D_I16|Q_DFP, D_U8)
- IO_TYPE(D_I16|Q_DFP, D_I16)
- IO_TYPE(D_F32, D_I32)
- IO_TYPE(D_F32, D_I16)
- IO_TYPE(D_F16, D_I32)
- IO_TYPE(D_I32, D_I32)
- IO_TYPE(D_I8|Q_DFP, D_I32)
- IO_TYPE(D_I8, D_I32)
- IO_TYPE(D_U8|Q_ASYM, D_I32)
- IO_TYPE(D_U8, D_I32)
+ IO_TYPE(D_F16, D_U8)
+ IO_TYPE(D_F16, D_I16)
+ IO_TYPE(D_BF16, D_U8)
+ IO_TYPE(D_BF16, D_I16)
+ IO_TYPE(D_I8|Q_DFP, D_U8)
+ IO_TYPE(D_I8|Q_DFP, D_I16)
+ IO_TYPE(D_U8|Q_ASYM, D_U8)
+ IO_TYPE(D_U8|Q_ASYM, D_I16)
+ IO_TYPE(D_I16|Q_DFP, D_U8)
+ IO_TYPE(D_I16|Q_DFP, D_I16)
+ IO_TYPE(D_F32, D_I32)
+ IO_TYPE(D_F32, D_I16)
+ IO_TYPE(D_F16, D_I32)
+ IO_TYPE(D_I32, D_I32)
+ IO_TYPE(D_I8|Q_DFP, D_I32)
+ IO_TYPE(D_U8|Q_ASYM, D_I32)
+ IO_TYPE(D_I8|Q_ASYM, D_U8)
+ IO_TYPE(D_I8|Q_ASYM, D_I16)
+ IO_TYPE(D_I8|Q_ASYM, D_I32)
+ IO_TYPE(D_I8|Q_SYM, D_U8)
+ IO_TYPE(D_I8|Q_SYM, D_I16)
+ IO_TYPE(D_I8|Q_SYM, D_I32)
+ IO_TYPE(D_I16|Q_ASYM, D_U8)
+ IO_TYPE(D_I16|Q_ASYM, D_I16)
+ IO_TYPE(D_I16|Q_ASYM, D_I32)
+ IO_TYPE(D_I16|Q_SYM, D_U8)
+ IO_TYPE(D_I16|Q_SYM, D_I16)
+ IO_TYPE(D_I16|Q_SYM, D_I32)
END_IO_TYPE_DECL(ARGMIN)
if(!VALIDATE_OP_IO_TYPES(ARGMIN, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
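
The widened IO_TYPE table above (and the similar tables in the ops that follow) adds D_I8 and D_I16 entries with Q_ASYM and Q_SYM qualifiers alongside the existing Q_DFP ones. Those qualifiers name quantization schemes, and the CL kernels in this patch consume all of them through the same affine scale-and-offset pattern (convert_float4(src) * inputScale + inputTail). The sketch below is a generic illustration of how each scheme maps stored integers to real values; it is not taken from this library, the function names are mine, and the DFP formula assumes the usual dynamic-fixed-point convention real = q * 2^-fl.

/* --- generic dequantization illustration, not OVXLIB code --- */
#include <math.h>
#include <stdio.h>
#include <stdint.h>

/* Affine asymmetric (Q_ASYM): real = scale * (q - zero_point). */
static float dequant_asym(int32_t q, float scale, int32_t zero_point)
{
    return scale * (float)(q - zero_point);
}

/* Symmetric (Q_SYM): same affine map with the zero point fixed at 0. */
static float dequant_sym(int32_t q, float scale)
{
    return scale * (float)q;
}

/* Dynamic fixed point (Q_DFP): 'fl' fractional bits, real = q * 2^-fl. */
static float dequant_dfp(int32_t q, int32_t fl)
{
    return (float)q * powf(2.0f, (float)-fl);
}

int main(void)
{
    /* The same stored integer decodes differently under each scheme. */
    printf("asym: %f\n", dequant_asym(200, 0.05f, 128)); /* 3.6  */
    printf("sym : %f\n", dequant_sym(-100, 0.05f));      /* -5.0 */
    printf("dfp : %f\n", dequant_dfp(96, 5));            /* 3.0  */
    return 0;
}
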
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c
index d0ba47d..56889cb 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c
@@ -25,7 +25,6 @@
#include
#include
-
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
@@ -81,7 +80,7 @@ static vsi_status op_compute
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes_ptr, shapes[3], &new_rank);
- if( ret )
+ if ( ret )
{
reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
inputs[0], shapes[0], new_rank );
@@ -148,39 +147,74 @@ static vsi_bool op_check
/* check inputs outputs data type */
BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1)
- IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP)
- IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16)
- IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16)
- IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16)
- IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP)
- IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_F16)
- IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_F16)
- IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16)
- IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16)
- IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32)
- IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16)
- IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16)
- IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16)
- IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP)
- IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_F16, D_F32, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_F16, D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_F16, D_F32, D_F16)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_F16)
+ IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_F32, D_F32, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_F32, D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_SYM)
END_IO_TYPE_DECL(BATCHNORM_SINGLE)
- if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) {
+ if (!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num))
+ {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
@@ -188,7 +222,7 @@ static vsi_bool op_check
return FALSE;
}
- for(i = 0; i < rank; i++)
+ for (i = 0; i < rank; i++)
{
vsi_size_t shape0 = inputs[0]->attr.size[i];
@@ -197,7 +231,7 @@ static vsi_bool op_check
uint32_t rank1 = inputs[j]->attr.dim_num;
vsi_size_t shape1 = rank1 > i ? inputs[j]->attr.size[i] : 1;
- if(shape0 != shape1 && shape1 != 1)
+ if (shape0 != shape1 && shape1 != 1)
{
VSILOGE("Invalid broadcast for inputs[%d] size[%"VSI_SIZE_T_SPECIFIER"]", j, shape1);
return FALSE;
@@ -215,7 +249,7 @@ static vsi_bool op_setup
)
{
/* TODO: Add code to comput outputs' shape. */
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
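
For reference, the broadcast rule enforced by the reformatted loop in op_check above is the standard one: for every dimension of inputs[0], the corresponding dimension of each other input must either match or be 1, and inputs with fewer dimensions are treated as if the missing dimensions were 1. A minimal stand-alone sketch of that rule, with hypothetical names, follows.

/* --- minimal sketch of the broadcast rule; names are hypothetical --- */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* True when 'shape' (rank dims) can broadcast against 'ref' (ref_rank dims). */
static bool can_broadcast(const size_t *ref, unsigned ref_rank,
                          const size_t *shape, unsigned rank)
{
    for (unsigned i = 0; i < ref_rank; i++)
    {
        size_t dim = (rank > i) ? shape[i] : 1;   /* missing dims behave like 1 */
        if (dim != ref[i] && dim != 1)
            return false;
    }
    return true;
}

int main(void)
{
    size_t ref[3]  = {8, 4, 16};
    size_t good[3] = {8, 1, 16};   /* the 1 broadcasts against 4 */
    size_t bad[2]  = {3, 4};       /* 3 is neither 8 nor 1 */
    printf("%d %d\n", can_broadcast(ref, 3, good, 3),
                      can_broadcast(ref, 3, bad, 2));
    return 0;
}
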
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c
index abe9d4a..a969fa6 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c
@@ -35,9 +35,9 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_error.h"
static vsi_bool setup_op_shapes
(
@@ -225,8 +225,10 @@ static vsi_bool op_setup
/* split input tensor */
split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final );
memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final );
memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
vsi_nn_rnn_split_input_tensor(self, input_tensor,
@@ -238,8 +240,10 @@ static vsi_bool op_setup
if(has_aux_input)
{
aux_split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( aux_split_output_tensors, "Create buffer fail.", final );
memset( aux_split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
aux_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final );
memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
vsi_nn_rnn_split_input_tensor(self, aux_input_tensor,
@@ -251,9 +255,11 @@ static vsi_bool op_setup
/* prepare output tensor */
lstmcell_reshape_output_tensors_fw = (vsi_nn_tensor_t **)malloc(time_step *
sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( lstmcell_reshape_output_tensors_fw, "Create buffer fail.", final );
memset( lstmcell_reshape_output_tensors_fw, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
lstmcell_reshape_output_tensors_bw = (vsi_nn_tensor_t **)malloc(time_step *
sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( lstmcell_reshape_output_tensors_bw, "Create buffer fail.", final );
memset( lstmcell_reshape_output_tensors_bw, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
for( i = 0; i < time_step; i++ )
@@ -474,6 +480,7 @@ static vsi_bool op_setup
{
vsi_nn_tensor_t** merge_tensors = NULL;
merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final );
memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
tensor = outputs[BI_LSTM_FW_OUTPUT_OUTPUT];
@@ -580,6 +587,7 @@ static vsi_bool op_setup
}
}
+final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( aux_split_output_tensors )
vsi_nn_safe_free( reshape_output_tensors );
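
The allocation changes in this file (and in the bidirectional RNN op below) all follow one pattern: each malloc is now followed by CHECK_PTR_FAIL_GOTO, so a failed allocation jumps to the new final: label, where every temporary pointer array is released through vsi_nn_safe_free no matter how far setup got. The generic sketch below shows the same goto-cleanup idiom without the library's macros; the function and variable names are illustrative only.

/* --- generic goto-cleanup sketch; names are illustrative, not OVXLIB's --- */
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

static bool setup_buffers(size_t time_step)
{
    bool ok = false;
    void **split   = NULL;
    void **reshape = NULL;

    split = malloc(time_step * sizeof(*split));
    if (split == NULL) goto final;            /* mirrors CHECK_PTR_FAIL_GOTO */
    memset(split, 0, time_step * sizeof(*split));

    reshape = malloc(time_step * sizeof(*reshape));
    if (reshape == NULL) goto final;
    memset(reshape, 0, time_step * sizeof(*reshape));

    /* ... per-timestep setup work would go here ... */
    ok = true;

final:
    /* Single exit: each buffer is freed exactly once, even on early failure. */
    free(split);
    free(reshape);
    return ok;
}

int main(void) { return setup_buffers(4) ? 0 : 1; }

Because the original code only freed these arrays on the success path, an allocation failure midway through op_setup previously leaked the earlier buffers; funnelling every exit through final: is what the added label fixes.
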
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c
index 46eea58..8f81613 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c
@@ -35,9 +35,9 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_error.h"
static vsi_bool setup_op_shapes
(
@@ -224,8 +224,10 @@ static vsi_bool op_setup
/* split input tensor */
split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final );
memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final );
memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
vsi_nn_rnn_split_input_tensor(self, input_tensor,
@@ -237,8 +239,10 @@ static vsi_bool op_setup
if(has_aux_input)
{
aux_split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( aux_split_output_tensors, "Create buffer fail.", final );
memset( aux_split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
aux_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final );
memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
vsi_nn_rnn_split_input_tensor(self, aux_input_tensor,
@@ -250,9 +254,11 @@ static vsi_bool op_setup
/* prepare output tensor */
rnncell_reshape_output_tensors_fw = (vsi_nn_tensor_t **)malloc(time_step *
sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( rnncell_reshape_output_tensors_fw, "Create buffer fail.", final );
memset( rnncell_reshape_output_tensors_fw, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
rnncell_reshape_output_tensors_bw = (vsi_nn_tensor_t **)malloc(time_step *
sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( rnncell_reshape_output_tensors_bw, "Create buffer fail.", final );
memset( rnncell_reshape_output_tensors_bw, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
for( i = 0; i < time_step; i++ )
@@ -387,6 +393,7 @@ static vsi_bool op_setup
{
vsi_nn_tensor_t** merge_tensors = NULL;
merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final );
memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
tensor = outputs[BI_RNN_FW_OUTPUT_OUTPUT];
@@ -493,6 +500,7 @@ static vsi_bool op_setup
}
}
+final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( aux_split_output_tensors )
vsi_nn_safe_free( reshape_output_tensors );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
index 5d16c2b..6e7288b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
@@ -55,8 +55,7 @@ static vsi_status op_compute
float min_value = self->nn_param.clip.min;
float max_value = self->nn_param.clip.max;
- if ( (min_value == -1.0f && max_value == 1.0f)
- || (min_value == 0.0f && max_value == 6.0f) )
+ if ( self->nn_param.clip.local2->is_internal_node )
{
status = VSI_SUCCESS;
vsi_nn_internal_compute_node( self );
@@ -69,7 +68,7 @@ static vsi_status op_compute
vsi_bool ret;
vsi_nn_kernel_param_t * param = NULL;
- param =vsi_nn_kernel_param_create();
+ param = vsi_nn_kernel_param_create();
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
@@ -78,7 +77,7 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_float32( param, "min_value", min_value );
vsi_nn_kernel_param_add_float32( param, "max_value", max_value );
- if( ret )
+ if ( ret )
{
reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
inputs[0], shape, new_rank );
@@ -90,17 +89,16 @@ static vsi_status op_compute
&reshape_tensors[0], 1,
&reshape_tensors[1], 1, param );
- vsi_nn_ReleaseTensor( &reshape_tensors[0] );
- vsi_nn_ReleaseTensor( &reshape_tensors[1] );
+ vsi_safe_release_tensor( reshape_tensors[0] );
+ vsi_safe_release_tensor( reshape_tensors[1] );
}
- if( self->n )
+ if ( self->n )
{
status = VSI_SUCCESS;
}
vsi_nn_kernel_param_release( &param );
-
}
return status;
@@ -114,22 +112,48 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(CLIP, 1, 1)
- IO_TYPE(D_F16, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_I16|Q_DFP)
- IO_TYPE(D_F16, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_F16)
- IO_TYPE(D_F32, D_F32)
- IO_TYPE(D_U8|Q_ASYM, D_F32)
- IO_TYPE(D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F16)
- IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F16)
- IO_TYPE(D_BF16, D_BF16)
+ IO_TYPE(D_F16, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_F16)
+ IO_TYPE(D_F32, D_F32)
+ IO_TYPE(D_I32, D_I32)
+ IO_TYPE(D_U8|Q_ASYM, D_F32)
+ IO_TYPE(D_I8|Q_DFP, D_F32)
+ IO_TYPE(D_I8|Q_ASYM, D_F32)
+ IO_TYPE(D_I8|Q_SYM, D_F32)
+ IO_TYPE(D_I16|Q_DFP, D_F32)
+ IO_TYPE(D_I16|Q_ASYM, D_F32)
+ IO_TYPE(D_I16|Q_SYM, D_F32)
+ IO_TYPE(D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_F32, D_I8|Q_SYM)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F16)
+ IO_TYPE(D_BF16, D_BF16)
END_IO_TYPE_DECL(CLIP)
- if(!VALIDATE_OP_IO_TYPES(CLIP, self, inputs, self->input.num, outputs, self->output.num)) {
+ if (!VALIDATE_OP_IO_TYPES(CLIP, self, inputs, self->input.num, outputs, self->output.num))
+ {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
@@ -145,31 +169,13 @@ static vsi_status op_deinit
vsi_nn_node_t * self
)
{
- uint32_t i;
- float min = self->nn_param.clip.min;
- float max = self->nn_param.clip.max;
-
- for (i = 0; i < _VSI_NN_CLIP_LOCAL_TENSOR_NUM; i++)
- {
- if (self->nn_param.clip.local.local_tensor[i] != NULL)
- {
- vxReleaseTensor(&(self->nn_param.clip.local.local_tensor[i]));
- self->nn_param.clip.local.local_tensor[i] = NULL;
- }
- }
-
- if (self->nn_param.clip.local2 != NULL)
- {
- free(self->nn_param.clip.local2);
- self->nn_param.clip.local2 = NULL;
- }
-
- if ( (min == -1.0f && max == 1.0f)
- || (min == 0.0f && max == 6.0f) )
+ if ( self->nn_param.clip.local2->is_internal_node )
{
vsi_nn_internal_deinit_node_wksp( self );
}
+ vsi_nn_safe_free(self->nn_param.clip.local2);
+
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
@@ -202,23 +208,39 @@ static vsi_bool op_setup
vsi_nn_internal_node_t* curr = NULL;
float min = self->nn_param.clip.min;
float max = self->nn_param.clip.max;
+ uint32_t infinity = VSI_NN_FLOAT32_INF;
+ float neg_infinity = -*(float*)&infinity;
+ int32_t max_float = *(int32_t*)&max;
if ( (min == -1.0f && max == 1.0f)
- || (min == 0.0f && max == 6.0f) )
+ || (min == 0.0f && max == 6.0f)
+ || (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
+ || (min == neg_infinity && max_float == VSI_NN_FLOAT32_INF))
{
vsi_nn_internal_init_node_wksp(self);
if (min == -1.0f && max == 1.0f)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU1, 0, 0);
}
- else
+ else if (min == 0.0f && max == 6.0f)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
}
+ else if (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
+ {
+ curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
+ }
+ else
+ {
+ curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0);
+ }
+
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
+
+ self->nn_param.clip.local2->is_internal_node = TRUE;
}
else
{
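
The reworked clip setup above keys the new cases off the bit pattern of max rather than a float comparison, so clip(0, +inf) lowers to RELU and clip(-inf, +inf) to DATACONVERT, and op_compute/op_deinit then only consult the is_internal_node flag recorded in local2. A standalone sketch of the bit-pattern test, assuming VSI_NN_FLOAT32_INF is the IEEE-754 single-precision +infinity pattern 0x7F800000 (memcpy is used here instead of the pointer cast to sidestep strict-aliasing concerns):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FLOAT32_INF_BITS 0x7F800000u   /* assumed value of VSI_NN_FLOAT32_INF */

static int is_pos_inf_bits(float v)
{
    uint32_t bits = 0;
    memcpy(&bits, &v, sizeof(bits));   /* reinterpret the float's bit pattern */
    return bits == FLOAT32_INF_BITS;
}

int main(void)
{
    printf("%d %d %d\n",
           is_pos_inf_bits(INFINITY),   /* 1 -> clip(0, +inf) becomes RELU   */
           is_pos_inf_bits(6.0f),       /* 0 -> clip(0, 6) stays RELU6       */
           is_pos_inf_bits(-INFINITY)); /* 0 -> only +inf matches this test  */
    return 0;
}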
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c
index 8c216ea..7dbe943 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c
@@ -103,6 +103,7 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c
index 327b949..03118aa 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c
@@ -35,7 +35,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
@@ -515,7 +515,9 @@ static vsi_bool op_setup
trans_input_tensor(self, inputs, trans_inputs);
split_outputs = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * timestep);
+ CHECK_PTR_FAIL_GOTO( split_outputs, "Create buffer fail.", final );
conv2dlstm_step_outputs = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * timestep);
+ CHECK_PTR_FAIL_GOTO( conv2dlstm_step_outputs, "Create buffer fail.", final );
memset(split_outputs, 0, sizeof(vsi_nn_tensor_t *) * timestep);
memset(conv2dlstm_step_outputs, 0, sizeof(vsi_nn_tensor_t *) * timestep);
@@ -636,6 +638,7 @@ static vsi_bool op_setup
trans_output_tensor(self, conv2dlstm_outputs, outputs);
}
+final:
vsi_nn_safe_free(split_outputs);
vsi_nn_safe_free(conv2dlstm_step_outputs)
return TRUE;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c
index 87aa2ba..9ab2266 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c
@@ -139,6 +139,10 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv
internal_bias = vsi_nn_internal_create_zero_bias_tensor(
self, &input->attr, &weight->attr, VSI_NN_OP_CONV2D, FALSE);
+ if (internal_bias == NULL)
+ {
+ return NULL;
+ }
bias = internal_bias->t;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c
index 35bf275..063dbd0 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c
@@ -83,7 +83,7 @@ static vsi_status op_compute
self->nn_param.conv3d.dilation[2] > 1)
{
VSILOGE("conv3d could not support dilation > 1\n");
- return status;
+ goto final;
}else
{
MAP_PARAM("dilation_w",self->nn_param.conv3d.dilation[0]);
@@ -98,6 +98,7 @@ static vsi_status op_compute
status = VSI_SUCCESS;
}
+final:
vsi_nn_kernel_param_release( &param );
return status;
} /* op_compute() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
index be82720..1929167 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
@@ -72,7 +72,7 @@ static vsi_bool _is_same_quant
dtype = &inputs[0]->attr.dtype;
_dtype = &outputs[0]->attr.dtype;
- if(vsi_nn_DtypeCompare(dtype, _dtype) == FALSE)
+ if (vsi_nn_DtypeCompare(dtype, _dtype) == FALSE)
{
return FALSE;
}
@@ -100,13 +100,17 @@ static vsi_status op_optimize
}
VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
- if( direction == VSI_NN_OPTIMIZE_FORWARD )
+ if ( direction == VSI_NN_OPTIMIZE_FORWARD )
{
- if(NULL == inputs[0]->t && NULL != outputs[0]->t)
+ if ( NULL == outputs[0]->t )
{
- inputs[0]->t = vsi_nn_safe_reshape_tensor(outputs[0]->t,
- (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0]));
- if( inputs[0]->t == NULL )
+ if ( NULL == inputs[0]->t )
+ {
+ vsi_nn_TensorReinit( self->graph, inputs[0] );
+ }
+ outputs[0]->t = vsi_nn_safe_reshape_tensor(inputs[0]->t,
+ (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num, sizeof(outputs[0]->attr.size[0]));
+ if ( outputs[0]->t == NULL )
{
VSILOGE("Call vsi_nn_safe_reshape_tensor fail");
return VSI_FAILURE;
@@ -116,11 +120,11 @@ static vsi_status op_optimize
}
else
{
- if(NULL == outputs[0]->t && NULL != inputs[0]->t)
+ if ( NULL == inputs[0]->t && NULL != outputs[0]->t )
{
- outputs[0]->t = vsi_nn_safe_reshape_tensor(inputs[0]->t,
- (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num, sizeof(outputs[0]->attr.size[0]));
- if( outputs[0]->t == NULL )
+ inputs[0]->t = vsi_nn_safe_reshape_tensor(outputs[0]->t,
+ (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0]));
+ if ( inputs[0]->t == NULL )
{
VSILOGE("Call vsi_nn_safe_reshape_tensor fail");
return VSI_FAILURE;
@@ -180,11 +184,12 @@ static vsi_bool op_check
IO_TYPE(D_F16, D_U32)
IO_TYPE(D_F16, D_BF16)
IO_TYPE(D_F16, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_I16|Q_SYM)
IO_TYPE(D_F16, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_SYM)
IO_TYPE(D_F16, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_I16)
- IO_TYPE(D_F16, D_I8)
- IO_TYPE(D_F16, D_U8)
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F32, D_I32|Q_DFP)
IO_TYPE(D_F32, D_I32|Q_ASYM)
@@ -194,30 +199,21 @@ static vsi_bool op_check
IO_TYPE(D_F32, D_I16|Q_DFP)
IO_TYPE(D_F32, D_I8|Q_DFP)
IO_TYPE(D_F32, D_U8|Q_ASYM)
- IO_TYPE(D_F32, D_I16)
- IO_TYPE(D_F32, D_I8)
- IO_TYPE(D_F32, D_U8)
IO_TYPE(D_I16|Q_DFP, D_F32)
IO_TYPE(D_I16|Q_DFP, D_I32)
IO_TYPE(D_I16|Q_DFP, D_U32)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_I16)
- IO_TYPE(D_I16|Q_DFP, D_I8)
- IO_TYPE(D_I16|Q_DFP, D_U8)
- IO_TYPE(D_I16, D_F32)
- IO_TYPE(D_I16, D_I32)
- IO_TYPE(D_I16, D_U32)
- IO_TYPE(D_I16, D_I16|Q_DFP)
- IO_TYPE(D_I16, D_I8|Q_DFP)
- IO_TYPE(D_I16, D_U8|Q_ASYM)
- IO_TYPE(D_I16, D_F16)
- IO_TYPE(D_I16, D_I16)
- IO_TYPE(D_I16, D_I8)
- IO_TYPE(D_I16, D_U8)
+ IO_TYPE(D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32)
+ IO_TYPE(D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F16)
IO_TYPE(D_I8|Q_DFP, D_I32|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_U32)
@@ -225,22 +221,9 @@ static vsi_bool op_check
IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM)
- IO_TYPE(D_I8|Q_DFP, D_I8)
- IO_TYPE(D_I8|Q_DFP, D_I8)
- IO_TYPE(D_I8|Q_DFP, D_I16)
- IO_TYPE(D_I8|Q_DFP, D_U8)
- IO_TYPE(D_I8, D_F32)
- IO_TYPE(D_I8, D_F16)
- IO_TYPE(D_I8, D_I32)
- IO_TYPE(D_I8, D_U32)
- IO_TYPE(D_I8, D_I8|Q_DFP)
- IO_TYPE(D_I8, D_I8|Q_ASYM)
- IO_TYPE(D_I8, D_I16|Q_DFP)
- IO_TYPE(D_I8, D_U8|Q_ASYM)
- IO_TYPE(D_I8, D_I8)
- IO_TYPE(D_I8, D_I16)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_ASYM)
@@ -250,12 +233,6 @@ static vsi_bool op_check
IO_TYPE(D_U8|Q_ASYM, D_I32|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U32)
IO_TYPE(D_U8|Q_ASYM, D_F32)
- IO_TYPE(D_U8, D_U8|Q_ASYM)
- IO_TYPE(D_U8, D_I16|Q_DFP)
- IO_TYPE(D_U8, D_F16)
- IO_TYPE(D_U8, D_I32)
- IO_TYPE(D_U8, D_U32)
- IO_TYPE(D_U8, D_F32)
IO_TYPE(D_BOOL8, D_BOOL8)
IO_TYPE(D_BOOL8, D_U8|Q_ASYM)
IO_TYPE(D_BOOL8, D_I8|Q_ASYM)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c
index 4adcc43..483a6dc 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c
@@ -71,6 +71,11 @@ static vsi_status op_compute
attr.size[2] = inputs[1]->attr.size[3];
attr.size[3] = inputs[1]->attr.size[2];
permute_tensor = vsi_nn_CreateTensor(self->graph, &attr);
+ if ( NULL == permute_tensor )
+ {
+ status = VSI_FAILURE;
+ goto final;
+ }
self->n = vxTensorPermuteNode( self->graph->g, inputs[1]->t,
permute_tensor->t, perm, 4);
if( NULL == self->n )
@@ -88,6 +93,11 @@ static vsi_status op_compute
memset(&attr_reverse, 0, sizeof(vsi_nn_tensor_attr_t));
memcpy(&attr_reverse, &tmp_in_tensor->attr, sizeof(vsi_nn_tensor_attr_t) );
reverse_tensor = vsi_nn_CreateTensor(self->graph, &attr_reverse);
+ if ( NULL == reverse_tensor )
+ {
+ status = VSI_FAILURE;
+ goto final;
+ }
para.axis = axis_reverse;
para.numberOfAxis = 2;
@@ -116,6 +126,11 @@ static vsi_status op_compute
attr.size[2] = inputs[1]->attr.size[3];
attr.size[3] = inputs[1]->attr.size[2];
permute_tensor = vsi_nn_CreateTensor(self->graph, &attr);
+ if ( NULL == permute_tensor )
+ {
+ status = VSI_FAILURE;
+ goto final;
+ }
self->n = vxTensorPermuteNode( self->graph->g, inputs[1]->t,
permute_tensor->t, perm, 4);
if( NULL == self->n )
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c
index 982d0d4..eb8f75b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c
@@ -51,7 +51,12 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_tensor_t* weight_tensor = NULL;
+ vsi_nn_tensor_t* weights_tensor = NULL;
vsi_nn_tensor_t* new_inputs[3] = {NULL};
+ vsi_nn_tensor_t *permute_tensor = NULL;
+#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
+ vsi_nn_tensor_t *reverse_tensor = NULL;
+#endif
memcpy(&weight_attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t));
weight_attr.size[3] = weight_attr.size[2];
@@ -64,7 +69,7 @@ static vsi_status op_compute
}
else
{
- uint8_t * data = NULL;
+ uint8_t * data = NULL;
data = vsi_nn_ConvertTensorToData( self->graph, inputs[1] );
if (NULL == data)
{
@@ -97,6 +102,83 @@ static vsi_status op_compute
}
#endif
+#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
+ if (FALSE == inputs[1]->attr.is_const)
+ {
+ vsi_nn_tensor_t *tmp_in_tensor = NULL;
+ vx_nn_tensor_reverse_params_t para;
+ vx_int32 axis_reverse[4] = {0, 1, 0, 0};
+ vsi_nn_tensor_attr_t attr_reverse;
+
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1)
+ {
+ uint32_t perm_array[] = { 0, 1, 3, 2 };
+ vsi_nn_tensor_attr_t attr;
+ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
+ memcpy( &attr, &weight_tensor->attr, sizeof(vsi_nn_tensor_attr_t) );
+ attr.size[2] = weight_tensor->attr.size[3];
+ attr.size[3] = weight_tensor->attr.size[2];
+ permute_tensor = vsi_nn_CreateTensor(self->graph, &attr);
+ self->n = vxTensorPermuteNode( self->graph->g, weight_tensor->t,
+ permute_tensor->t, perm_array, 4);
+ if ( NULL == self->n )
+ {
+ status = VSI_FAILURE;
+ goto final;
+ }
+ tmp_in_tensor = permute_tensor;
+ }
+ else
+ {
+ tmp_in_tensor = weight_tensor;
+ }
+
+ memset(&attr_reverse, 0, sizeof(vsi_nn_tensor_attr_t));
+ memcpy(&attr_reverse, &tmp_in_tensor->attr, sizeof(vsi_nn_tensor_attr_t) );
+ reverse_tensor = vsi_nn_CreateTensor(self->graph, &attr_reverse);
+ para.axis = axis_reverse;
+ para.numberOfAxis = 2;
+
+ self->n = vxTensorReverse( self->graph->g, tmp_in_tensor->t, &para,
+ sizeof(vx_nn_tensor_reverse_params_t), reverse_tensor->t );
+ if ( NULL == self->n )
+ {
+ status = VSI_FAILURE;
+ goto final;
+ }
+
+ weights_tensor = reverse_tensor;
+ }
+ else
+ {
+ weights_tensor = weight_tensor;
+ }
+
+#else
+ if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && FALSE == inputs[1]->attr.is_const)
+ {
+ uint32_t perm_array[] = { 0, 1, 3, 2 };
+ vsi_nn_tensor_attr_t attr;
+ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
+ memcpy( &attr, &weight_tensor->attr, sizeof(vsi_nn_tensor_attr_t) );
+ attr.size[2] = weight_tensor->attr.size[3];
+ attr.size[3] = weight_tensor->attr.size[2];
+ permute_tensor = vsi_nn_CreateTensor(self->graph, &attr);
+ self->n = vxTensorPermuteNode( self->graph->g, weight_tensor->t,
+ permute_tensor->t, perm_array, 4);
+ if( NULL == self->n )
+ {
+ status = VSI_FAILURE;
+ goto final;
+ }
+ weights_tensor = permute_tensor;
+ }
+ else
+ {
+ weights_tensor = weight_tensor;
+ }
+#endif
+
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.deconvolution1d.stride );
vsi_nn_kernel_param_add_int32( param, "pad_front", self->nn_param.deconvolution1d.pad[0] );
@@ -108,7 +190,7 @@ static vsi_status op_compute
"down_scale_size_rounding", self->vx_param.down_scale_size_rounding );
new_inputs[0] = inputs[0];
- new_inputs[1] = weight_tensor;
+ new_inputs[1] = weights_tensor;
new_inputs[2] = inputs[2];
self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "deconvolution1d",
@@ -120,11 +202,12 @@ static vsi_status op_compute
}
vsi_nn_kernel_param_release( &param );
- if ( weight_tensor )
- {
- vsi_nn_ReleaseTensor( &weight_tensor );
- }
-
+final:
+ vsi_safe_release_tensor(weight_tensor);
+ vsi_safe_release_tensor(permute_tensor);
+#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
+ vsi_safe_release_tensor(reverse_tensor);
+#endif
return status;
} /* op_compute() */
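
For reference on the weight fix-up added above: when the deconvolution1d weights are not constant and the context predates version 1.1.21, they are permuted with perm_array = {0, 1, 3, 2}, i.e. the last two dimensions (assumed here to be the input/output-channel dimensions) are swapped, and the Khronos-compatible layout additionally reverses the two spatial axes. A standalone sketch of what that permutation does to a shape:

#include <stdio.h>

/* out[i] = in[perm[i]], matching the perm_array = {0, 1, 3, 2} used above. */
static void permute_shape(const unsigned in[4], const unsigned perm[4], unsigned out[4])
{
    int i;
    for (i = 0; i < 4; ++i)
    {
        out[i] = in[perm[i]];
    }
}

int main(void)
{
    /* Illustrative weight shape: kernel_w, kernel_h, in_channels, out_channels (assumed order). */
    const unsigned weight[4] = { 3, 1, 16, 32 };
    const unsigned perm[4]   = { 0, 1, 3, 2 };
    unsigned permuted[4];

    permute_shape(weight, perm, permuted);
    printf("%u %u %u %u\n", permuted[0], permuted[1], permuted[2], permuted[3]); /* 3 1 32 16 */
    return 0;
}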
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
index 1f39eb7..c1c4404 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
@@ -50,14 +50,15 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
//int32_t mode = self->nn_param.depth2space_internal.mode;
- int32_t block_size = self->nn_param.depth2space_internal.block_size;
+ int32_t block_size = 0;
if( NULL == self )
{
return VSI_FAILURE;
}
- param =vsi_nn_kernel_param_create();
+ block_size = self->nn_param.depth2space_internal.block_size;
+ param = vsi_nn_kernel_param_create();
// Add params
vsi_nn_kernel_param_add_int32( param, "block_size", block_size );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
index 19a5303..496d42e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
@@ -222,6 +222,37 @@ static vsi_bool op_check_minimum
IO_TYPE(D_I32, D_I32, D_I32)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16)
+ IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_I16|Q_SYM, D_F16)
+ IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_ASYM, D_I16|Q_SYM)
END_IO_TYPE_DECL(MINIMUM)
if(!VALIDATE_OP_IO_TYPES(MINIMUM, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
@@ -267,6 +298,37 @@ static vsi_bool op_check_maximum
IO_TYPE(D_I32, D_I32, D_I32)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16)
+ IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_I16|Q_SYM, D_F16)
+ IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_ASYM, D_I16|Q_SYM)
END_IO_TYPE_DECL(MAXIMUM)
if(!VALIDATE_OP_IO_TYPES(MAXIMUM, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c
index a3a054e..9a85fd1 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c
@@ -58,6 +58,16 @@ static vsi_status _eltwise_unary_op_compute
if (strcmp(kernel_name, "elu") == 0)
{
alpha = self->nn_param.elu.alpha;
+ beta = 1.0f;
+ }
+ else if (strcmp(kernel_name, "celu") == 0)
+ {
+ alpha = self->nn_param.celu.alpha;
+ }
+ else if (strcmp(kernel_name, "selu") == 0)
+ {
+ alpha = self->nn_param.selu.alpha;
+ beta = self->nn_param.selu.gamma;
}
else
{
@@ -74,14 +84,18 @@ static vsi_status _eltwise_unary_op_compute
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"hard_gelu", inputs, 1, outputs, 1, param );
}
+ else if (strcmp(kernel_name, "elu") == 0 )
+ {
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+ "selu", inputs, 1, outputs, 1, param );
+ }
else
{
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
kernel_name, inputs, 1, outputs, 1, param );
}
-
- if( self->n )
+ if ( self->n )
{
status = VSI_SUCCESS;
}
@@ -141,32 +155,29 @@ static vsi_bool op_check
/* check inputs outputs data type */
BEGIN_IO_TYPE_DECL(ELTWISE_UNARY, 1, 1)
/* IO_TYPE(INPUT, OUTPUT) */
- IO_TYPE(D_I32, D_I32)
-
- IO_TYPE(D_F32, D_F32)
- IO_TYPE(D_F32, D_F16)
- IO_TYPE(D_F32, D_BF16)
-
- IO_TYPE(D_F16, D_F32)
- IO_TYPE(D_F16, D_F16)
- IO_TYPE(D_F16, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_I16|Q_DFP)
-
- IO_TYPE(D_BF16, D_BF16)
- IO_TYPE(D_BF16, D_F32)
-
- IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F16)
-
- IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
- IO_TYPE(D_I8|Q_ASYM, D_F16)
-
- IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F16)
-
- IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F16)
+ IO_TYPE(D_I32, D_I32)
+ IO_TYPE(D_F32, D_F32)
+ IO_TYPE(D_F32, D_F16)
+ IO_TYPE(D_F32, D_BF16)
+ IO_TYPE(D_F16, D_F32)
+ IO_TYPE(D_F16, D_F16)
+ IO_TYPE(D_F16, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_I16|Q_DFP)
+ IO_TYPE(D_BF16, D_BF16)
+ IO_TYPE(D_BF16, D_F32)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F16)
END_IO_TYPE_DECL(ELTWISE_UNARY)
if (!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num))
{
@@ -196,6 +207,11 @@ static vsi_status _eltwise_unary_op_init
self->nn_param.hard_sigmoid.alpha = 0.2f;
self->nn_param.hard_sigmoid.beta = 0.5f;
}
+ else if (strcmp(kernel_name, "selu") == 0)
+ {
+ self->nn_param.selu.alpha = 1.67326319217681884765625f;
+ self->nn_param.selu.gamma = 1.05070102214813232421875f;
+ }
return VSI_SUCCESS;
} /* op_init() */
@@ -234,6 +250,8 @@ DEF_ELEMENT_WISE_UNARY_OP( HARD_SIGMOID, hard_sigmoid );
DEF_ELEMENT_WISE_UNARY_OP( MISH, mish );
DEF_ELEMENT_WISE_UNARY_OP( ROUND, round );
DEF_ELEMENT_WISE_UNARY_OP( GELU, gelu );
+DEF_ELEMENT_WISE_UNARY_OP( SELU, selu );
+DEF_ELEMENT_WISE_UNARY_OP( CELU, celu );
#undef DEF_ELEMENT_UNARY_WISE_OP
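
The SELU/CELU registrations above route both activations through the shared element-wise unary kernel, with alpha/beta carrying (alpha, gamma) for SELU and alpha for CELU; the op_init defaults match the standard SELU constants. As a reference for the math these parameters drive (the CELU form follows the common ONNX-style definition, which is an assumption here), a standalone sketch:

#include <math.h>
#include <stdio.h>

/* selu(x) = gamma * (x if x > 0, else alpha * (exp(x) - 1)) */
static float selu_ref(float x, float alpha, float gamma)
{
    return gamma * (x > 0.0f ? x : alpha * (expf(x) - 1.0f));
}

/* celu(x) = max(0, x) + min(0, alpha * (exp(x / alpha) - 1)) */
static float celu_ref(float x, float alpha)
{
    float pos = x > 0.0f ? x : 0.0f;
    float neg = x < 0.0f ? alpha * (expf(x / alpha) - 1.0f) : 0.0f;
    return pos + neg;
}

int main(void)
{
    const float alpha = 1.67326319217681884765625f;  /* same literals as op_init above */
    const float gamma = 1.05070102214813232421875f;
    printf("selu(-1) = %f, celu(-1) = %f\n",
           selu_ref(-1.0f, alpha, gamma), celu_ref(-1.0f, 1.0f));
    return 0;
}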
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c
index 5da991f..a789f2c 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c
@@ -71,30 +71,28 @@ static vsi_bool op_check
{
BEGIN_IO_TYPE_DECL(ERF, 1, 1)
/* IO_TYPE(INPUT, OUTPUT) */
- IO_TYPE(D_F32, D_F32)
- IO_TYPE(D_F32, D_F16)
- IO_TYPE(D_F32, D_BF16)
-
- IO_TYPE(D_F16, D_F32)
- IO_TYPE(D_F16, D_F16)
- IO_TYPE(D_F16, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_I16|Q_DFP)
-
- IO_TYPE(D_BF16, D_BF16)
- IO_TYPE(D_BF16, D_F32)
-
- IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F16)
-
- IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
- IO_TYPE(D_I8|Q_ASYM, D_F16)
-
- IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F16)
-
- IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F16)
+ IO_TYPE(D_F32, D_F32)
+ IO_TYPE(D_F32, D_F16)
+ IO_TYPE(D_F32, D_BF16)
+ IO_TYPE(D_F16, D_F32)
+ IO_TYPE(D_F16, D_F16)
+ IO_TYPE(D_F16, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_I16|Q_DFP)
+ IO_TYPE(D_BF16, D_BF16)
+ IO_TYPE(D_BF16, D_F32)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F16)
END_IO_TYPE_DECL(ERF)
if (!VALIDATE_OP_IO_TYPES(ERF, self, inputs, self->input.num, outputs, self->output.num))
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c
index 0c57380..92b1337 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c
@@ -123,6 +123,30 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
IO_TYPE(D_I32, D_I16|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I32, D_I32, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_F16, D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_I32, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I32, D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_I32, D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I32, D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_F16, D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_I32, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I32, D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM)
+ IO_TYPE(D_I32, D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I32, D_I32, D_I16|Q_SYM)
END_IO_TYPE_DECL(FLOORDIV)
if (!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num))
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c
index 34bcd78..f2b9142 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c
@@ -101,22 +101,34 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(GATHER, 2, 1)
- IO_TYPE(D_I32, D_I32, D_I32)
- IO_TYPE(D_F32, D_I32, D_F32)
- IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_I32, D_I16|Q_DFP)
- IO_TYPE(D_F16, D_I32, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_I32, D_F16)
- IO_TYPE(D_BF16, D_I32, D_BF16)
- IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
- IO_TYPE(D_U8, D_I32, D_U8)
- IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16)
- IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_I32, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_I32, D_F16)
+ IO_TYPE(D_I32, D_I32, D_I32)
+ IO_TYPE(D_F32, D_I32, D_F32)
+ IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_I32, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_I32, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_I32, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_I32, D_F16)
+ IO_TYPE(D_BF16, D_I32, D_BF16)
+ IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_I32, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_I32, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_F16)
END_IO_TYPE_DECL(GATHER)
- if(!VALIDATE_OP_IO_TYPES(GATHER, self, inputs, self->input.num, outputs, self->output.num)) {
+ if (!VALIDATE_OP_IO_TYPES(GATHER, self, inputs, self->input.num, outputs, self->output.num))
+ {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
@@ -150,7 +162,7 @@ static vsi_bool op_setup
uint32_t i = 0;
vsi_nn_gather_param * p = NULL;
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
uint32_t j = 0;
p = &(self->nn_param.gather);
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c
new file mode 100644
index 0000000..2ba3c5c
--- /dev/null
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c
@@ -0,0 +1,196 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "utils/vsi_nn_math.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+typedef struct _gather_elements_local_data_t {
+ int32_t placeholder;
+} gather_elements_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+ vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
+ uint32_t rank_in = 0;
+ int32_t axis = 0;
+ int32_t new_axis0 = 0;
+ int32_t new_axis1 = 0;
+ vsi_bool ret = FALSE;
+ vsi_nn_kernel_param_t * param = NULL;
+ vsi_nn_gather_elements_param * p = NULL;
+
+ if ( NULL == self )
+ {
+ return VSI_FAILURE;
+ }
+ status = VSI_FAILURE;
+
+ p = &(self->nn_param.gather_elements);
+ axis = p->axis;
+
+ ret = vsi_nn_kernel_optimize_softmax_shape(
+ inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+ shapes[0], &rank_in, &new_axis0);
+ ret |= vsi_nn_kernel_optimize_softmax_shape(
+ inputs[1]->attr.size, inputs[1]->attr.dim_num, axis,
+ shapes[1], &rank_in, &new_axis1);
+
+ // Add params
+ param = vsi_nn_kernel_param_create();
+
+ if ( ret && new_axis0 == new_axis1 )
+ {
+ vsi_nn_kernel_param_add_int32( param, "axis", new_axis0 );
+
+ reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
+ inputs[0], shapes[0], rank_in );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+ inputs[1], shapes[1], rank_in );
+ reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph,
+ outputs[0], shapes[1], rank_in );
+
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+ "gather_elements",
+ &reshape_tensors[0], 2,
+ &reshape_tensors[2], 1, param );
+
+ vsi_safe_release_tensor( reshape_tensors[0] );
+ vsi_safe_release_tensor( reshape_tensors[1] );
+ vsi_safe_release_tensor( reshape_tensors[2] );
+ }
+ else
+ {
+ vsi_nn_kernel_param_add_int32( param, "axis", axis );
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+ "gather_elements",
+ inputs, 2,
+ outputs, 1, param );
+ }
+
+ vsi_nn_kernel_param_release( &param );
+
+ if ( self->n )
+ {
+ status = VSI_SUCCESS;
+ }
+
+ return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ BEGIN_IO_TYPE_DECL(GATHER_ELEMENTS, 2, 1)
+ IO_TYPE(D_I32, D_I32, D_I32)
+ IO_TYPE(D_F32, D_I32, D_F32)
+ IO_TYPE(D_F16, D_I32, D_F16)
+ IO_TYPE(D_BF16, D_I32, D_BF16)
+ IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
+ IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM)
+ END_IO_TYPE_DECL(GATHER_ELEMENTS)
+ if (!VALIDATE_OP_IO_TYPES(GATHER_ELEMENTS, self, inputs, self->input.num, outputs, self->output.num))
+ {
+ char* desc = generate_op_io_types_desc(inputs,
+ self->input.num, outputs, self->output.num);
+ VSILOGE("Inputs/Outputs data type not support: %s", desc);
+ destroy_op_io_types_desc(desc);
+ return FALSE;
+ }
+
+ return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ {
+ uint32_t i = 0;
+ outputs[0]->attr.dim_num = inputs[1]->attr.dim_num;
+ for (i = 0; i < inputs[0]->attr.dim_num; i++)
+ {
+ outputs[0]->attr.size[i] = inputs[1]->attr.size[i];
+ }
+ }
+
+ return TRUE;
+} /* op_setup() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+ (
+ /* op_name */ GATHER_ELEMENTS,
+ /* init */ NULL,
+ /* compute */ op_compute,
+ /* deinit */ vsi_nn_op_common_deinit,
+ /* check */ op_check,
+ /* setup */ op_setup,
+ /* optimize */ NULL,
+ /* input_num */ _INPUT_NUM,
+ /* output_num */ _OUTPUT_NUM
+ );
+
+__END_DECLS
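
As a reference for what the newly registered operator computes: GATHER_ELEMENTS is assumed here to follow the ONNX GatherElements semantics, where the index tensor has the same rank as the data tensor and each output element picks one data element along the chosen axis, e.g. out[i][j] = data[idx[i][j]][j] for rank 2 and axis = 0. A standalone sketch of that case:

#include <stdio.h>

/* Rank-2, axis = 0 gather-elements: out[i][j] = data[idx[i][j]][j]. */
static void gather_elements_axis0(const float *data, const int *idx, float *out,
                                  int rows, int cols, int out_rows)
{
    int i, j;
    for (i = 0; i < out_rows; ++i)
    {
        for (j = 0; j < cols; ++j)
        {
            out[i * cols + j] = data[idx[i * cols + j] * cols + j];
        }
    }
    (void)rows;  /* rows only bounds the valid index values */
}

int main(void)
{
    float data[2 * 2] = { 1.f, 2.f, 3.f, 4.f };
    int   idx [2 * 2] = { 0, 1, 1, 0 };
    float out [2 * 2];

    gather_elements_axis0(data, idx, out, 2, 2, 2);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 4 3 2 */
    return 0;
}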
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c
index 6cf086c..e77633f 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c
@@ -96,19 +96,31 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(GATHER_ND, 2, 1)
- IO_TYPE(D_I32, D_I32, D_I32)
- IO_TYPE(D_F32, D_I32, D_F32)
- IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_I32, D_I16|Q_DFP)
- IO_TYPE(D_F16, D_I32, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_I32, D_F16)
- IO_TYPE(D_BF16, D_I32, D_BF16)
- IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16)
- IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_I32, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_I32, D_F16)
+ IO_TYPE(D_I32, D_I32, D_I32)
+ IO_TYPE(D_F32, D_I32, D_F32)
+ IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_I32, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_I32, D_I16|Q_SYM)
+ IO_TYPE(D_F16, D_I32, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_I32, D_F16)
+ IO_TYPE(D_BF16, D_I32, D_BF16)
+ IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_I32, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_I32, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_F16)
END_IO_TYPE_DECL(GATHER_ND)
if (!VALIDATE_OP_IO_TYPES(GATHER_ND, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c
index 6890763..4f2ae60 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c
@@ -66,6 +66,11 @@ static vsi_status op_compute
vsi_nn_grouped_conv2d_param *nn_param = &self->nn_param.grouped_conv2d;
nn_param->local = (vsi_nn_grouped_conv2d_param_local_data*)malloc(
sizeof(vsi_nn_grouped_conv2d_param_local_data));
+ if (NULL == nn_param->local)
+ {
+ VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ return VSI_FAILURE;
+ }
memset(nn_param->local, 0, sizeof(vsi_nn_grouped_conv2d_param_local_data));
/* TODO */
/* example code : add op */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c
index 227650d..5afb30b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c
@@ -38,6 +38,7 @@
#include "libnnext/vsi_nn_vxkernel.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_error.h"
typedef struct _gru_ovxlib_local_data_t {
vsi_nn_tensor_t* weights_input;
@@ -214,8 +215,10 @@ static vsi_bool op_setup_default
/* split input tensor */
split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final );
memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
grucell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final );
memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
@@ -346,6 +349,7 @@ static vsi_bool op_setup_default
}
}
+final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( grucell_reshape_output_tensors );
@@ -486,8 +490,10 @@ static vsi_bool op_setup_optimized
/* split input tensor */
split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final );
memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
grucell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final );
memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
@@ -650,6 +656,7 @@ static vsi_bool op_setup_optimized
vsi_nn_rnn_create_permute(self, last_step_h_state, outputs[GRU_OUTPUT_H_STATE],
permute_in_perm, 2, use_virtual_tensor);
+final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( grucell_reshape_output_tensors );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c
index 08955f3..0f7baf9 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c
@@ -39,6 +39,7 @@
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_constraint_check.h"
#include "utils/vsi_nn_dtype_util.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
@@ -62,6 +63,7 @@ static vsi_nn_tensor_t* _expand_scale_tensor
vsi_nn_dtype_t out_dtype;
f32_out_buffer= (float *)malloc(scale_size_out * sizeof(float));
+ CHECK_PTR_FAIL_GOTO( f32_out_buffer, "Create buffer fail.", final );
memset(f32_out_buffer, 0, scale_size_out * sizeof(float));
f32_in_buffer = vsi_nn_ConvertTensorToFloat32Data(graph, scale);
if (NULL == f32_in_buffer)
@@ -87,6 +89,7 @@ static vsi_nn_tensor_t* _expand_scale_tensor
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
attr.vtl = FALSE;
scale_tensor = vsi_nn_CreateTensor(graph, &attr);
+ CHECK_PTR_FAIL_GOTO( scale_tensor, "Create tensor fail.", final );
out_dtype = scale->attr.dtype;
out_dtype.vx_type = VSI_NN_TYPE_FLOAT32;
out_dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
@@ -94,7 +97,6 @@ static vsi_nn_tensor_t* _expand_scale_tensor
(uint8_t*)f32_out_buffer, &out_dtype, scale_tensor);
if (VSI_SUCCESS != status)
{
- scale_tensor = NULL;
goto final;
}
@@ -251,24 +253,41 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(L2NORMALIZESCALE, _INPUT_NUM, _OUTPUT_NUM)
- IO_TYPE(D_F16, D_F16, D_F16)
- IO_TYPE(D_F16, D_F32, D_F16)
- IO_TYPE(D_BF16, D_BF16, D_BF16)
- IO_TYPE(D_BF16, D_F32, D_BF16)
- IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
- IO_TYPE(D_I8|Q_DFP, D_F32, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F32, D_F16)
- IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_F32, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F32, D_F16)
- IO_TYPE(D_F32, D_F32, D_F32)
- IO_TYPE(D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16, D_F16)
+ IO_TYPE(D_F16, D_F32, D_F16)
+ IO_TYPE(D_BF16, D_BF16, D_BF16)
+ IO_TYPE(D_BF16, D_F32, D_BF16)
+ IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_F32, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F32, D_F16)
+ IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_F32, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F32, D_F16)
+ IO_TYPE(D_F32, D_F32, D_F32)
+ IO_TYPE(D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F32, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F32, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F32, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F32, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_F32, D_F16)
END_IO_TYPE_DECL(L2NORMALIZESCALE)
- if(!VALIDATE_OP_IO_TYPES(L2NORMALIZESCALE, self, inputs, self->input.num, outputs, self->output.num)) {
+ if (!VALIDATE_OP_IO_TYPES(L2NORMALIZESCALE, self, inputs, self->input.num, outputs, self->output.num))
+ {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c
index 881767e..a70c3f7 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c
@@ -114,8 +114,16 @@ static vsi_bool op_check
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32)
IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F32)
+ IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32)
+ IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_F32, D_F32)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32)
+ IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32)
+ IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_F32, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F16, D_F16)
IO_TYPE(D_F16, D_F32, D_F32)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c
index 99dbd5d..2e0e48b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c
@@ -22,7 +22,6 @@
*
*****************************************************************************/
-
#include <string.h>
#include <stdlib.h>
@@ -80,34 +79,57 @@ static vsi_bool op_check
{
BEGIN_IO_TYPE_DECL(ONE_HOT, 1, 1)
/* IO_TYPE(INPUT, OUTPUT) */
- IO_TYPE(D_F32, D_F32)
- IO_TYPE(D_F32, D_F16)
-
- IO_TYPE(D_F16, D_F32)
- IO_TYPE(D_F16, D_F16)
- IO_TYPE(D_F16, D_U8|Q_ASYM)
- IO_TYPE(D_F16, D_I8|Q_DFP)
- IO_TYPE(D_F16, D_I16|Q_DFP)
-
- IO_TYPE(D_I32, D_F32)
- IO_TYPE(D_I32, D_F16)
- IO_TYPE(D_I32, D_U8|Q_ASYM)
- IO_TYPE(D_I32, D_I8|Q_DFP)
- IO_TYPE(D_I32, D_I16|Q_DFP)
- IO_TYPE(D_I32, D_I32)
-
- IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_F16)
-
- IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
- IO_TYPE(D_I8|Q_ASYM, D_F16)
-
- IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_F16)
-
- IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_F16)
- IO_TYPE(D_BF16, D_BF16)
+ IO_TYPE(D_F32, D_F32)
+ IO_TYPE(D_F32, D_F16)
+ IO_TYPE(D_F16, D_F32)
+ IO_TYPE(D_F16, D_F16)
+ IO_TYPE(D_F16, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_ASYM)
+ IO_TYPE(D_F16, D_I8|Q_SYM)
+ IO_TYPE(D_F16, D_I8|Q_DFP)
+ IO_TYPE(D_F16, D_I16|Q_DFP)
+ IO_TYPE(D_F16, D_I16|Q_ASYM)
+ IO_TYPE(D_F16, D_I16|Q_SYM)
+ IO_TYPE(D_I32, D_F32)
+ IO_TYPE(D_I32, D_F16)
+ IO_TYPE(D_I32, D_U8|Q_ASYM)
+ IO_TYPE(D_I32, D_I8|Q_DFP)
+ IO_TYPE(D_I32, D_I8|Q_ASYM)
+ IO_TYPE(D_I32, D_I8|Q_SYM)
+ IO_TYPE(D_I32, D_I16|Q_DFP)
+ IO_TYPE(D_I32, D_I16|Q_ASYM)
+ IO_TYPE(D_I32, D_I16|Q_SYM)
+ IO_TYPE(D_I32, D_I32)
+ IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM)
+ IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM)
+ IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
+ IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP)
+ IO_TYPE(D_U8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_U8|Q_ASYM, D_BF16)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_F32)
+ IO_TYPE(D_I16|Q_DFP, D_BF16)
+ IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_F32)
+ IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_BF16, D_BF16)
END_IO_TYPE_DECL(ONE_HOT)
if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num))
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c
index a8d7b30..e6e5d72 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c
@@ -36,6 +36,7 @@
#include "utils/vsi_nn_math.h"
#include "utils/vsi_nn_constraint_check.h"
#include "utils/vsi_nn_dtype_util.h"
+#include "vsi_nn_error.h"
vsi_status vsi_nn_InitPadParameter
(
@@ -45,6 +46,8 @@ vsi_status vsi_nn_InitPadParameter
{
int32_t pad_const_val;
uint8_t i;
+ vsi_status status = VSI_FAILURE;
+
if(NULL == node || NULL == param)
{
VSILOGE("Set param fail\n");
@@ -85,7 +88,11 @@ vsi_status vsi_nn_InitPadParameter
*/
param->numViewDimensions = vsi_nn_max(node->nn_param.pad.dim_num, 2);
param->pad_front_array = (int32_t *)malloc(sizeof(int32_t) * param->numViewDimensions);
+ CHECK_PTR_FAIL_GOTO( param->pad_front_array, "Create buffer fail.", final );
param->pad_back_array = (int32_t *)malloc(sizeof(int32_t) * param->numViewDimensions);
+ CHECK_PTR_FAIL_GOTO( param->pad_back_array, "Create buffer fail.", final );
+ status = VSI_SUCCESS;
+
memset(param->pad_front_array, 0, sizeof(int32_t) * param->numViewDimensions);
memset(param->pad_back_array, 0, sizeof(int32_t) * param->numViewDimensions);
for(i=0; i < vsi_nn_min(param->numViewDimensions, node->nn_param.pad.dim_num); i++)
@@ -94,7 +101,8 @@ vsi_status vsi_nn_InitPadParameter
param->pad_back_array[i] = (int32_t)node->nn_param.pad.back_size[i];
}
- return VSI_SUCCESS;
+final:
+ return status;
} /* vsi_nn_InitPadParameter() */
void vsi_nn_DeinitPadParameter
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
index 6a955a5..eb74aff 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
@@ -34,7 +34,6 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_util.h"
@@ -95,7 +94,8 @@ static vsi_bool op_setup
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA ||
- p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR
+ p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ||
+ p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP
)
{
uint32_t i = 0;
@@ -103,7 +103,6 @@ static vsi_bool op_setup
vsi_nn_tensor_attr_t attr;
vsi_bool use_virtual_tensor = TRUE;
-
for (i = 0; i < p->dim_num; i++)
{
_axis = p->perm[i];
@@ -292,10 +291,12 @@ static vsi_bool op_setup
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR:
+ case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP:
{
uint32_t i = 0;
uint32_t axis = 2;
uint32_t group = 3;
+ vsi_bool is_input_sep = p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ? FALSE : TRUE;
vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0];
vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL};
vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL };
@@ -305,11 +306,14 @@ static vsi_bool op_setup
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
- ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis,
- input_tensor_group, group);
- if (ret == FALSE)
+ if (!is_input_sep)
{
- goto final;
+ ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis,
+ input_tensor_group, group);
+ if (ret == FALSE)
+ {
+ goto final;
+ }
}
memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t));
@@ -348,24 +352,33 @@ static vsi_bool op_setup
memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 );
}
- for (i = 0; i < 3; i++)
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR, 0, 0 );
+ if (is_input_sep)
{
- curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 );
-
- curr->node->nn_param.pre_process_gray.mean = mean[i];
- curr->node->nn_param.pre_process_gray.scale = p->norm.scale;
- curr->node->nn_param.pre_process_gray.rect.left = p->rect.left;
- curr->node->nn_param.pre_process_gray.rect.top = p->rect.top;
- curr->node->nn_param.pre_process_gray.rect.width = p->rect.width;
- curr->node->nn_param.pre_process_gray.rect.height = p->rect.height;
- curr->node->nn_param.pre_process_gray.output_attr.size = size_32bit;
- curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num;
-
- curr->inputs[0] = input_tensor_group[i];
- curr->outputs[0] = output_tensor_group[i]->t;
-
- vsi_nn_internal_setup_node(self, curr);
+ curr->inputs[0] = inputs[0];
+ curr->inputs[1] = inputs[1];
+ curr->inputs[2] = inputs[2];
}
+ else
+ {
+ curr->inputs[0] = input_tensor_group[0];
+ curr->inputs[1] = input_tensor_group[1];
+ curr->inputs[2] = input_tensor_group[2];
+ }
+ curr->outputs[0] = output_tensor_group[0]->t;
+ curr->outputs[1] = output_tensor_group[1]->t;
+ curr->outputs[2] = output_tensor_group[2]->t;
+ curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0];
+ curr->node->nn_param.pre_process_rgb888_planar.g_mean = mean[1];
+ curr->node->nn_param.pre_process_rgb888_planar.b_mean = mean[2];
+ curr->node->nn_param.pre_process_rgb888_planar.scale = p->norm.scale;
+ curr->node->nn_param.pre_process_rgb888_planar.rect.left = p->rect.left;
+ curr->node->nn_param.pre_process_rgb888_planar.rect.top = p->rect.top;
+ curr->node->nn_param.pre_process_rgb888_planar.rect.width = p->rect.width;
+ curr->node->nn_param.pre_process_rgb888_planar.rect.height = p->rect.height;
+ curr->node->nn_param.pre_process_rgb888_planar.output_attr.size = size_32bit;
+ curr->node->nn_param.pre_process_rgb888_planar.output_attr.dim_num = p->output_attr.dim_num;
+ vsi_nn_internal_setup_node(self, curr);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 );
@@ -483,7 +496,8 @@ static vsi_bool op_setup
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA ||
- p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR
+ p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ||
+ p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP
)
{
if (layout == VSI_NN_DEST_LAYOUT_NHWC)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c
new file mode 100644
index 0000000..e0123fa
--- /dev/null
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c
@@ -0,0 +1,221 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+typedef struct _pre_process_rgb888_planar_local_data_t {
+ int32_t scale_x;
+ int32_t scale_y;
+ vsi_bool enable_copy;
+ vsi_bool enable_perm;
+} pre_process_rgb888_planar_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM (3)
+#define _OUTPUT_NUM (3)
+
+static vsi_status op_compute
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_param_t * param = NULL;
+ vsi_nn_kernel_node_t n = NULL;
+ param = vsi_nn_kernel_param_create();
+
+ vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb888_planar.local->scale_x );
+ vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb888_planar.local->scale_y );
+ vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb888_planar.rect.left );
+ vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb888_planar.rect.top );
+ vsi_nn_kernel_param_add_int32( param, "width", self->nn_param.pre_process_rgb888_planar.rect.width );
+ vsi_nn_kernel_param_add_int32( param, "height", self->nn_param.pre_process_rgb888_planar.rect.height );
+ vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb888_planar.r_mean );
+ vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb888_planar.g_mean );
+ vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb888_planar.b_mean );
+ vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_rgb888_planar.scale );
+ vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb888_planar.local->enable_copy );
+ n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 3, param );
+ if ( n != NULL )
+ {
+ self->n = (vx_node)n;
+ status = VSI_SUCCESS;
+ }
+
+ if (param != NULL)
+ {
+        vsi_nn_kernel_param_release( &param );
+ }
+
+ return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3)
+ IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16)
+ END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR)
+ if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num, outputs, self->output.num)) {
+ char* desc = generate_op_io_types_desc(inputs,
+ self->input.num, outputs, self->output.num);
+ VSILOGE("Inputs/Outputs data type not support: %s", desc);
+ destroy_op_io_types_desc(desc);
+ return FALSE;
+ }
+
+ return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ vsi_nn_pre_process_rgb888_planar_param * p = NULL;
+ uint32_t i = 0, j = 0;
+ p = (vsi_nn_pre_process_rgb888_planar_param *)&(self->nn_param.pre_process_rgb888_planar);
+
+ if (p->rect.width == 0 || p->rect.height == 0)
+ {
+ VSILOGE("Image size cannot be zero !(PRE_PROCESS_RGB888_PLANAR)\n");
+ return FALSE;
+ }
+ else
+ {
+ for (i = 0; i < p->output_attr.dim_num; i++)
+ {
+ if (p->output_attr.size[i] == 0)
+ {
+ VSILOGE("output size cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n");
+ return FALSE;
+ }
+ }
+ }
+
+ for (j = 0; j < 3; j++)
+ {
+ if ( VSI_NN_DIM_AUTO == outputs[j]->attr.dim_num )
+ {
+ if (p->output_attr.dim_num > 0)
+ {
+ outputs[j]->attr.dim_num = p->output_attr.dim_num;
+ for (i = 0; i < p->output_attr.dim_num; i++)
+ {
+ outputs[j]->attr.dim_num = p->output_attr.dim_num;
+ outputs[j]->attr.size[i] = p->output_attr.size[i];
+ }
+ }
+ else
+ {
+ VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n");
+ return FALSE;
+ }
+ }
+ }
+
+ p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]);
+ p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]);
+
+ p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));
+
+ return TRUE;
+} /* op_setup() */
+
+static vsi_status op_init
+ (
+ vsi_nn_node_t* self
+ )
+{
+ self->nn_param.pre_process_rgb888_planar.local =
+ (pre_process_rgb888_planar_local_data_t *)malloc(sizeof(pre_process_rgb888_planar_local_data_t));
+
+ if (NULL == self->nn_param.pre_process_rgb888_planar.local)
+ {
+ return VX_ERROR_NO_MEMORY;
+ }
+ memset(self->nn_param.pre_process_rgb888_planar.local, 0, sizeof(pre_process_rgb888_planar_local_data_t));
+
+ return VSI_SUCCESS;
+} /* op_init() */
+
+static vsi_status op_deinit
+ (
+ vsi_nn_node_t* self
+ )
+{
+ vsi_status status = VSI_SUCCESS;
+
+ if (self->nn_param.pre_process_rgb888_planar.local != NULL)
+ {
+ free(self->nn_param.pre_process_rgb888_planar.local);
+ self->nn_param.pre_process_rgb888_planar.local = NULL;
+ }
+ vsi_nn_op_common_deinit(self);
+
+ return status;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+ (
+ /* op_name */ PRE_PROCESS_RGB888_PLANAR,
+ /* init */ op_init,
+ /* compute */ op_compute,
+ /* deinit */ op_deinit,
+ /* check */ op_check,
+ /* setup */ op_setup,
+ /* optimize */ NULL,
+ /* input_num */ _INPUT_NUM,
+ /* output_num */ _OUTPUT_NUM
+ );
+
+__END_DECLS
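A minimal standalone sketch of the Q1.15 crop-to-output scale factors that op_setup() above derives; the crop and output sizes below are made-up example values, not part of the patch.

    #include <cstdint>
    #include <cstdio>

    int main() {
        /* Stand-ins for p->rect.width/height and outputs[0]->attr.size[0]/[1]. */
        uint32_t crop_w = 640, crop_h = 480;
        uint32_t out_w  = 320, out_h  = 240;

        /* Same Q1.15 ratios as op_setup(): 1 << 15 means "no resize". */
        int32_t scale_x = (int32_t)((crop_w << 15) / out_w);
        int32_t scale_y = (int32_t)((crop_h << 15) / out_h);
        bool enable_copy = (scale_x == scale_y) && (scale_x == (1 << 15));

        /* Here the crop is 2x the output, so scale_x == 65536 and the
         * resize path (enable_copy == false) is taken. */
        printf("scale_x=%d scale_y=%d enable_copy=%d\n",
               (int)scale_x, (int)scale_y, (int)enable_copy);
        return 0;
    }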
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
index ecbf5fa..3642b47 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
@@ -1228,7 +1228,7 @@ static vsi_status op_deinit
vsi_nn_node_t * self
)
{
- vsi_bool use_interanl_node = self->nn_param.reduce.local2->use_internal_node;
+ vsi_bool use_interanl_node = FALSE;
if (self->nn_param.reduce.local.axis_tensor != NULL)
{
@@ -1237,6 +1237,7 @@ static vsi_status op_deinit
if (self->nn_param.reduce.local2 != NULL)
{
+ use_interanl_node = self->nn_param.reduce.local2->use_internal_node;
if (self->nn_param.reduce.local2->axis_tensor2 != NULL)
{
vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local2->axis_tensor2));
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c
index 211ab7d..ebfa574 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c
@@ -52,7 +52,7 @@ static vsi_status _comparisons_op_compute
vsi_size_t new_rank = 0;
vsi_bool ret;
vsi_nn_kernel_param_t * param = NULL;
- vsi_nn_relational_ops_type_t op_type = self->nn_param.relational_ops.op;
+ vsi_nn_relational_ops_type_t op_type;
if( NULL == self )
{
@@ -60,6 +60,8 @@ static vsi_status _comparisons_op_compute
}
status = VSI_FAILURE;
+ op_type = self->nn_param.relational_ops.op;
+
// TODO: This optimzie is a hack for gpu path,
// it should be moved to gpu kernel setup.
ret = vsi_nn_kernel_optimize_eltwise_shape(
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c
index d4629ec..b16ba26 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c
@@ -37,6 +37,8 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
+VSI_NN_SUPPRESS_DEPRECATED_BEGIN
+
static vsi_status op_compute
(
vsi_nn_node_t * self,
@@ -81,8 +83,7 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
- //TODO: Check tensor shapes.
- return TRUE;
+ return vsi_nn_OpCheck(VSI_NN_OP_DATACONVERT, self, inputs, outputs);
} /* op_check() */
static vsi_bool op_setup
@@ -93,11 +94,11 @@ static vsi_bool op_setup
)
{
vsi_bool ret = TRUE;
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t i = 0;
- for(i = 0; i < self->nn_param.reshape.dim_num; i++)
+ for (i = 0; i < self->nn_param.reshape.dim_num; i++)
{
shape[i] = -1 == self->nn_param.reshape.size[i] ? -1 : (vsi_size_t)self->nn_param.reshape.size[i];
}
@@ -119,51 +120,46 @@ static vsi_status op_optimize
)
{
vsi_status status;
- vsi_bool ret;
status = VSI_SUCCESS;
- ret = TRUE;
- if( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
+ if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return status;
}
- if (self->nn_param.reshape.local.initialized == FALSE)
+ VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
+ if ( direction == VSI_NN_OPTIMIZE_BACKWARD )
{
- VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
- if ( direction == VSI_NN_OPTIMIZE_BACKWARD )
+ if (NULL == inputs[0]->t && NULL != outputs[0]->t)
{
- if (NULL == inputs[0]->t && NULL != outputs[0]->t)
+ inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t,
+ (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num,
+ sizeof(inputs[0]->attr.size[0]) );
+ if ( inputs[0]->t == NULL )
{
- inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t,
- (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num,
- sizeof(inputs[0]->attr.size[0]) );
- if ( inputs[0]->t == NULL )
- {
- status = VSI_FAILURE;
- }
- self->nn_param.reshape.local.initialized = TRUE;
+ status = VSI_FAILURE;
}
+ self->nn_param.reshape.local.initialized = TRUE;
}
- else
+ }
+ else
+ {
+ if (NULL == outputs[0]->t)
{
- if (NULL == outputs[0]->t)
+ if ( NULL == inputs[0]->t )
{
- vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
- uint32_t i = 0;
- for (i = 0; i < self->nn_param.reshape.dim_num; i++)
- {
- shape[i] = -1 == self->nn_param.reshape.size[i] ? -1 : (vsi_size_t)self->nn_param.reshape.size[i];
- }
- ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0],
- shape, self->nn_param.reshape.dim_num );
- if ( ret == FALSE )
- {
- status = VSI_FAILURE;
- }
- self->nn_param.reshape.local.initialized = TRUE;
+ vsi_nn_TensorReinit( self->graph, inputs[0] );
}
+
+ outputs[0]->t = vsi_nn_safe_reshape_tensor( inputs[0]->t,
+ (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num,
+ sizeof(outputs[0]->attr.size[0]) );
+ if ( outputs[0]->t == NULL )
+ {
+ status = VSI_FAILURE;
+ }
+ self->nn_param.reshape.local.initialized = TRUE;
}
}
@@ -188,4 +184,5 @@ DEF_OP_REG
);
#ifdef __cplusplus
}
+VSI_NN_SUPPRESS_DEPRECATED_END
#endif
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c
index 4132004..6a84273 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c
@@ -50,7 +50,7 @@ static vsi_status op_compute
*If reshape is un-initialized, we need add a tensorcopy
* when input and output are initialized.
*/
- if(inputs[0]->t != NULL && outputs[0]->t != NULL &&
+ if (inputs[0]->t != NULL && outputs[0]->t != NULL &&
self->nn_param.reshape2.local->initialized == FALSE)
{
self->n = vxTensorCopyNode(self->graph->g,
@@ -72,8 +72,7 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
- //TODO: Check tensor shapes.
- return TRUE;
+ return vsi_nn_OpCheck(VSI_NN_OP_DATACONVERT, self, inputs, outputs);
} /* op_check() */
static vsi_status op_init
@@ -116,7 +115,7 @@ static vsi_bool op_setup
)
{
vsi_bool ret = TRUE;
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
memcpy(shape, self->nn_param.reshape2.size,
@@ -139,44 +138,44 @@ static vsi_status op_optimize
)
{
vsi_status status;
- vsi_bool ret;
status = VSI_SUCCESS;
- ret = TRUE;
if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return status;
}
- if (self->nn_param.reshape2.local->initialized == FALSE)
+ if ( direction == VSI_NN_OPTIMIZE_BACKWARD )
{
- VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
- if ( direction == VSI_NN_OPTIMIZE_BACKWARD )
+ if (NULL == inputs[0]->t && NULL != outputs[0]->t)
{
- if (NULL == inputs[0]->t && NULL != outputs[0]->t)
+ inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t,
+ (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num,
+ sizeof(inputs[0]->attr.size[0]) );
+ if ( inputs[0]->t == NULL )
{
- inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t,
- (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num,
- sizeof(inputs[0]->attr.size[0]) );
- if ( inputs[0]->t == NULL )
- {
- status = VSI_FAILURE;
- }
- self->nn_param.reshape2.local->initialized = TRUE;
+ status = VSI_FAILURE;
}
+ self->nn_param.reshape2.local->initialized = TRUE;
}
- else
+ }
+ else
+ {
+ if (NULL == outputs[0]->t)
{
- if (NULL == outputs[0]->t)
+ if ( NULL == inputs[0]->t )
{
- ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0],
- self->nn_param.reshape2.size, self->nn_param.reshape2.dim_num );
- if ( ret == FALSE )
- {
- status = VSI_FAILURE;
- }
- self->nn_param.reshape2.local->initialized = TRUE;
+ vsi_nn_TensorReinit( self->graph, inputs[0] );
}
+
+ outputs[0]->t = vsi_nn_safe_reshape_tensor( inputs[0]->t,
+ (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num,
+ sizeof(outputs[0]->attr.size[0]) );
+ if ( outputs[0]->t == NULL )
+ {
+ status = VSI_FAILURE;
+ }
+ self->nn_param.reshape2.local->initialized = TRUE;
}
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c
index ec8b441..d5f3e54 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c
@@ -35,7 +35,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
@@ -157,11 +157,14 @@ static vsi_bool op_setup
vsi_bool use_virtual_tensor = TRUE;
uint32_t kernel_h = 1;
uint32_t kernel_w = 1;
+ vsi_bool ret = FALSE;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_node_wksp( self );
p->local = (vsi_nn_rnncell_ovxlib_lcl_data_t*)
malloc(sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t));
+ CHECK_PTR_FAIL_GOTO( p->local, "Create buffer fail.", final );
+ ret = TRUE;
memset(p->local, 0x00, sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t));
memset(&attr, 0x00, sizeof(attr));
@@ -328,7 +331,8 @@ static vsi_bool op_setup
vsi_nn_internal_setup_node(self, curr);
}
- return TRUE;
+final:
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
@@ -349,16 +353,22 @@ static vsi_status op_init
vsi_nn_node_t * self
)
{
+ vsi_status status = VSI_FAILURE;
+
self->nn_param.rnncell_ovxlib.local = (vsi_nn_rnncell_ovxlib_lcl_data_t *)
malloc(sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t));
+ CHECK_PTR_FAIL_GOTO( self->nn_param.rnncell_ovxlib.local, "Create buffer fail.", final );
memset(self->nn_param.rnncell_ovxlib.local, 0,
sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t));
self->nn_param.rnncell_ovxlib.internal_dtype = (vsi_nn_dtype_t *)
malloc(sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT);
+ CHECK_PTR_FAIL_GOTO( self->nn_param.rnncell_ovxlib.internal_dtype, "Create buffer fail.", final );
memset(self->nn_param.rnncell_ovxlib.internal_dtype, 0,
sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT);
- return VSI_SUCCESS;
+ status = VSI_SUCCESS;
+final:
+ return status;
} /* op_init() */
#ifdef __cplusplus
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c
index 78c3886..8121363 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c
@@ -188,9 +188,10 @@ static vsi_status op_deinit
vsi_nn_node_t * self
)
{
- vx_tensor rois = self->nn_param.roi_pool.local.rois;
- if( NULL != self && NULL != self->n )
+ vx_tensor rois = NULL;
+ if ( NULL != self && NULL != self->n )
{
+ rois = self->nn_param.roi_pool.local.rois;
if(rois)
{
vxReleaseTensor(&rois);
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c
index f9213ad..14b3250 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c
@@ -54,16 +54,16 @@ static vsi_status op_compute
vsi_size_t *input_size = inputs[1]->attr.size;
uint32_t dims_num = inputs[1]->attr.dim_num;
- if(inputs[0]->attr.dim_num > 1)
+ if (inputs[0]->attr.dim_num > 1)
{
coord_dim = (uint32_t)inputs[0]->attr.size[0];
}
- if( coord_dim > 3 )
+ if ( coord_dim > 3 )
{
CHECK_STATUS(status);
return status;
}
- for(i = 0; i < inputs[0]->attr.dim_num; i++)
+ for (i = 0; i < inputs[0]->attr.dim_num; i++)
{
idx_num *= (uint32_t)inputs[0]->attr.size[i];
}
@@ -71,7 +71,7 @@ static vsi_status op_compute
param =vsi_nn_kernel_param_create();
- for(i = 0; i < dims_num; ++i)
+ for (i = 0; i < dims_num; ++i)
{
block_size *= (uint32_t)input_size[i];
}
@@ -81,13 +81,13 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim );
vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num );
n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
- if( n != NULL )
+ if ( n != NULL )
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
- if(param != NULL)
+ if (param != NULL)
{
         vsi_nn_kernel_param_release( &param );
}
@@ -104,15 +104,20 @@ static vsi_bool op_check
{
BEGIN_IO_TYPE_DECL(SCATTER_ND, 2, 1)
IO_TYPE(D_I32, D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I32, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I32, D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_I32, D_F16, D_F16)
- IO_TYPE(D_I32, D_I32, D_I32)
- IO_TYPE(D_I32, D_U32, D_U32)
- IO_TYPE(D_I32, D_F32, D_F32)
- IO_TYPE(D_I32, D_BF16,D_BF16)
+ IO_TYPE(D_I32, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I32, D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I32, D_F16, D_F16)
+ IO_TYPE(D_I32, D_I32, D_I32)
+ IO_TYPE(D_I32, D_U32, D_U32)
+ IO_TYPE(D_I32, D_F32, D_F32)
+ IO_TYPE(D_I32, D_BF16, D_BF16)
END_IO_TYPE_DECL(SCATTER_ND)
- if(!VALIDATE_OP_IO_TYPES(SCATTER_ND, self, inputs, self->input.num, outputs, self->output.num)) {
+ if (!VALIDATE_OP_IO_TYPES(SCATTER_ND, self, inputs, self->input.num, outputs, self->output.num))
+ {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
@@ -134,9 +139,9 @@ static vsi_bool op_setup
uint32_t i = 0;
vsi_nn_scatter_nd_param * p = &(self->nn_param.scatter_nd);
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
- if(p->shape == NULL)
+ if (p->shape == NULL)
{
return FALSE;
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c
index d8c0c8d..384cb7f 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c
@@ -63,7 +63,7 @@ static vsi_status op_compute
CHECK_STATUS(status);
return status;
}
- for(i = 0; i < inputs[1]->attr.dim_num; i++)
+ for (i = 0; i < inputs[1]->attr.dim_num; i++)
{
idx_num *= (uint32_t)inputs[1]->attr.size[i];
}
@@ -71,7 +71,7 @@ static vsi_status op_compute
param =vsi_nn_kernel_param_create();
- for(i = 0; i < dims_num; ++i)
+ for (i = 0; i < dims_num; ++i)
{
block_size *= (uint32_t)input_size[i];
}
@@ -103,20 +103,29 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(SCATTER_ND_UPDATE, 3, 1)
- IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_I8|Q_DFP)
- IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_F16)
- IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM)
- IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_F16)
- IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_F16)
- IO_TYPE(D_F16, D_I32, D_F16, D_F16)
- IO_TYPE(D_F16, D_I32, D_F16, D_U8|Q_ASYM)
- IO_TYPE(D_BF16, D_I32, D_BF16, D_BF16)
- IO_TYPE(D_I32, D_I32, D_I32, D_I32)
- IO_TYPE(D_U32, D_I32, D_U32, D_U32)
- IO_TYPE(D_F32, D_I32, D_F32, D_F32)
+ IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_F16)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM, D_F16)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM, D_F16)
+ IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_F16)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM, D_F16)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM, D_F16)
+ IO_TYPE(D_F16, D_I32, D_F16, D_F16)
+ IO_TYPE(D_F16, D_I32, D_F16, D_U8|Q_ASYM)
+ IO_TYPE(D_BF16, D_I32, D_BF16, D_BF16)
+ IO_TYPE(D_I32, D_I32, D_I32, D_I32)
+ IO_TYPE(D_U32, D_I32, D_U32, D_U32)
+ IO_TYPE(D_F32, D_I32, D_F32, D_F32)
END_IO_TYPE_DECL(SCATTER_ND_UPDATE)
- if (!VALIDATE_OP_IO_TYPES(SCATTER_ND_UPDATE, self, inputs, self->input.num, outputs, self->output.num)) {
+ if (!VALIDATE_OP_IO_TYPES(SCATTER_ND_UPDATE, self, inputs, self->input.num, outputs, self->output.num))
+ {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c
index 5660eea..34202c3 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c
@@ -49,15 +49,17 @@ static vsi_status op_compute
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
- int32_t block_size_x = self->nn_param.space2depth_internal.block_size_x;
- int32_t block_size_y = self->nn_param.space2depth_internal.block_size_y;
+ int32_t block_size_x = 0;
+ int32_t block_size_y = 0;
if ( NULL == self )
{
return VSI_FAILURE;
}
- param =vsi_nn_kernel_param_create();
+ block_size_x = self->nn_param.space2depth_internal.block_size_x;
+ block_size_y = self->nn_param.space2depth_internal.block_size_y;
+ param = vsi_nn_kernel_param_create();
// Add params
vsi_nn_kernel_param_add_int32( param, "block_size_x", block_size_x );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
index 76495df..dcd34fe 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
@@ -151,17 +151,17 @@ static vsi_bool _get_stride_slice_start_stop_stride
start[i] = vsi_nn_clamp(start[i], 0, (vx_int32)(inputs[0]->attr.size[i] - 1));
- if (params->shrink_axis_mask & (1 << i))
- {
- stop[i] = start[i] + 1;
- }
-
if (params->end_mask & (1 << i))
{
stop[i] = (int32_t)get_slice_mask_stop_value(stride[i], (uint32_t)inputs[0]->attr.size[i]);
}
stop[i] = (int32_t)get_slice_clamp_stop(stride[i], stop[i], (uint32_t)inputs[0]->attr.size[i]);
+
+ if (params->shrink_axis_mask & (1 << i))
+ {
+ stop[i] = start[i] + 1;
+ }
}
/* reset start stop and stride when output size is 1*/
@@ -833,7 +833,7 @@ static vsi_status op_init
params->begin_dims =
(int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
- if (NULL == lcl2_data->begin_dims)
+ if (NULL == params->begin_dims)
{
return VX_ERROR_NO_MEMORY;
}
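The strided_slice hunk above moves the shrink_axis_mask handling after the end_mask/clamp step. A small sketch of that per-axis ordering, with the two helpers reduced to simplified placeholders (the real get_slice_mask_stop_value()/get_slice_clamp_stop() also handle negative strides):

    #include <cstdint>
    #include <cstdio>

    /* Simplified stand-ins for get_slice_mask_stop_value()/get_slice_clamp_stop(). */
    static int32_t resolve_stop(int32_t start, int32_t stop, int32_t stride,
                                int32_t dim, uint32_t end_mask,
                                uint32_t shrink_axis_mask, int axis) {
        if (end_mask & (1u << axis)) {
            stop = (stride > 0) ? dim : -1;
        }
        if (stride > 0 && stop > dim) {
            stop = dim;
        }
        /* shrink_axis_mask is applied last now, so a shrunk axis keeps exactly
         * one element even when end_mask is set on the same axis. */
        if (shrink_axis_mask & (1u << axis)) {
            stop = start + 1;
        }
        return stop;
    }

    int main() {
        /* Axis 0 of a length-10 dimension, start = 3, both masks set: stop = 4. */
        printf("stop = %d\n", resolve_stop(3, 10, 1, 10, 0x1, 0x1, 0));
        return 0;
    }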
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c
index 09343e7..e1e2615 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c
@@ -35,6 +35,7 @@
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_constraint_check.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (2)
@@ -70,7 +71,27 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
- /*TODO: Check tensor shapes. */
+ BEGIN_IO_TYPE_DECL(TOPK, _INPUT_NUM, _OUTPUT_NUM)
+ IO_TYPE(D_F16, D_F16, D_I32)
+ IO_TYPE(D_F32, D_F32, D_I32)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I32)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I32)
+ IO_TYPE(D_I32, D_I32, D_I32)
+ END_IO_TYPE_DECL(TOPK)
+ if (!VALIDATE_OP_IO_TYPES(TOPK, self, inputs, self->input.num, outputs, self->output.num))
+ {
+ char* desc = generate_op_io_types_desc(inputs,
+ self->input.num, outputs, self->output.num);
+ VSILOGE("Inputs/Outputs data type not support: %s", desc);
+ destroy_op_io_types_desc(desc);
+ return FALSE;
+ }
+
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c
index d213bb9..a8a2a7e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c
@@ -1,4 +1,3 @@
-
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
@@ -45,22 +44,7 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
- /*
- Need copy input data to output if don't reshape input to output
- */
- if(inputs[0]->t != NULL && outputs[0]->t != NULL &&
- self->nn_param.variable.local->initialized == FALSE)
- {
- self->n = vxTensorCopyNode(self->graph->g,
- inputs[0]->t, outputs[0]->t);
- if(NULL == self->n)
- {
- VSILOGE( "Create vxTensorCopyNode fail." );
- return VSI_FAILURE;
- }
- VSILOGD("Create a copy node for variable");
- }
- return VSI_SUCCESS;
+ return vsi_nn_internal_compute_node( self );
} /* op_compute() */
static vsi_bool op_check
@@ -85,53 +69,41 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
- vsi_nn_variable_lcl_data *local = NULL;
- if( direction == VSI_NN_OPTIMIZE_BACKWARD )
- {
- return VSI_SUCCESS;
- }
- local = (vsi_nn_variable_lcl_data *)malloc(sizeof(vsi_nn_variable_lcl_data));
- if( NULL == local )
- {
- VSILOGE("malloc memory fail");
- return VSI_FAILURE;
- }
- memset(local, 0, sizeof(vsi_nn_variable_lcl_data));
- if( NULL != inputs[0]->t && NULL == outputs[0]->t &&
- vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype))
- {
- VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
- outputs[0]->t = vsi_nn_safe_reshape_tensor(inputs[0]->t, (void*)outputs[0]->attr.size,
- (vsi_size_t)outputs[0]->attr.dim_num, sizeof(outputs[0]->attr.size[0]));
- if( NULL == outputs[0]->t )
- {
- VSILOGE("Call vsi_nn_safe_reshape_tensor fail");
- free(local);
- local = NULL;
- return VSI_FAILURE;
- }
- local->initialized = TRUE;
- }
- else
- {
- local->initialized = FALSE;
- }
- self->nn_param.variable.local = local;
- return VSI_SUCCESS;
+ return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
+static vsi_bool op_setup
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ vsi_bool ret = TRUE;
+ vsi_nn_internal_node_t* curr = NULL;
+
+ vsi_nn_internal_init_node_wksp(self);
+ curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
+ if (NULL == curr)
+ {
+ return FALSE;
+ }
+ curr->inputs[0] = inputs[0];
+ curr->outputs[0] = outputs[0];
+
+ vsi_nn_internal_setup_node(self, curr);
+
+ return ret;
+}
+
static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
- vsi_nn_variable_lcl_data *local = self->nn_param.variable.local;
- if(local)
- {
- free(local);
- local = NULL;
- }
+ vsi_nn_internal_deinit_node_wksp(self);
vsi_nn_op_common_deinit(self);
+
return VSI_SUCCESS;
} /* op_deinit() */
@@ -146,7 +118,7 @@ DEF_OP_REG
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
- /* setup */ vsi_nn_op_common_setup,
+ /* setup */ op_setup,
/* optimize */ op_optimize,
/* input_num */ 1,
/* output_num */ 1
@@ -154,4 +126,3 @@ DEF_OP_REG
#ifdef __cplusplus
}
#endif
-
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
index bae1005..ade122c 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
@@ -440,6 +440,10 @@ static _op_param_gen_t s_op_gen[] =
/* DECONV3D */ NULL,
/* PAD2 */ NULL,
/* COS */ NULL,
+ /* PRE_PROCESS_RGB888_PLANAR */ NULL,
+ /* GATHER_ELEMENTS */ NULL,
+ /* SELU */ NULL,
+ /* CELU */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );
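The _cnt_of_array/_compiler_assert pair keeps s_op_gen and the op enum in step at build time. A generic sketch of that idiom, with an illustrative macro and names rather than ovxlib's actual definitions:

    /* Illustrative compile-time check using the negative-array-size trick;
     * ovxlib's real _compiler_assert macro may be defined differently. */
    #define STATIC_ASSERT_SKETCH(cond, tag) \
        typedef char static_assert_##tag[(cond) ? 1 : -1]

    enum { OP_A, OP_B, OP_NUM_SKETCH };
    static const char* op_names[] = { "A", "B" };

    /* Breaks the build if op_names falls out of step with the enum. */
    STATIC_ASSERT_SKETCH(sizeof(op_names) / sizeof(op_names[0]) == OP_NUM_SKETCH,
                         op_table_in_sync);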
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c
index 2f6aec6..f170bcf 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c
@@ -33,6 +33,7 @@
#include "vsi_nn_log.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_math.h"
+#include "vsi_nn_error.h"
typedef struct _node_io_signature_t {
int count;
@@ -102,6 +103,7 @@ static node_io_signature_t* _get_op_signature
item = malloc(sizeof(node_io_signature_t) + \
(reg_io_count - 1) * sizeof(vsi_nn_type_e));
+ CHECK_PTR_FAIL_GOTO( item, "Create buffer fail.", final );
item->count = inputs_num + outputs_num;
memset(&item->types[0], 0x00, reg_io_count * sizeof(vsi_nn_type_e));
@@ -128,6 +130,7 @@ static node_io_signature_t* _get_op_signature
outputs[i]->attr.dtype.qnt_type << Q_SHIFT;
}
+final:
return item;
}
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_map.c b/src/tim/vx/internal/src/utils/vsi_nn_map.c
index b046f5e..db08568 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_map.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_map.c
@@ -83,6 +83,10 @@ void vsi_nn_MapAdd
{
key_iter = (vsi_nn_map_key_list_t *)vsi_nn_LinkListNewNode(
sizeof( vsi_nn_map_key_list_t ), NULL );
+ if ( NULL == key_iter )
+ {
+ return;
+ }
key_iter->val = key;
vsi_nn_LinkListPushStart( (vsi_nn_link_list_t **)&map->keys,
(vsi_nn_link_list_t *)key_iter );
@@ -140,4 +144,3 @@ vsi_bool vsi_nn_MapHasKey
return TRUE;
}
} /* vsi_nn_MapHasKey() */
-
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_math.c b/src/tim/vx/internal/src/utils/vsi_nn_math.c
index 65878b6..b2aae05 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_math.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_math.c
@@ -429,3 +429,45 @@ void vsi_nn_random_uniform_transform
uniform_buf[i] = random_buf[i] / rand_max;
}
}
+
+float _evaluate_polynomial(float x, const float *coefficients, int32_t len)
+{
+ float poly = 0;
+ int32_t i = 0;
+
+ for (i = 0; i < len; i ++)
+ {
+ float c = coefficients[i];
+ poly = poly * x + c;
+ }
+ return poly;
+}
+
+// Compute a polynomial approximation of the error function.
+// This is the same approximation used by Eigen.
+float vsi_nn_erf_impl(float x)
+{
+    float x2 = 0;
+ // The monomial coefficients of the numerator polynomial (odd).
+ const float kAlpha[7] =
+ {
+ -2.72614225801306e-10f, 2.77068142495902e-08f, -2.10102402082508e-06f,
+ -5.69250639462346e-05f, -7.34990630326855e-04f, -2.95459980854025e-03f,
+ -1.60960333262415e-02f,
+ };
+ // The monomial coefficients of the denominator polynomial (even).
+ const float kBeta[5] =
+ {
+ -1.45660718464996e-05f, -2.13374055278905e-04f, -1.68282697438203e-03f,
+ -7.37332916720468e-03f, -1.42647390514189e-02f,
+ };
+
+ // Clamp the inputs to the range [-4, 4] since anything outside
+ // this range is +/-1.0f in single-precision.
+ x = vsi_clamp(-4.f, x, 4.f);
+ // Since the polynomials are odd/even, we need x^2.
+ x2 = x * x;
+
+ return x * _evaluate_polynomial(x2, kAlpha, 7) /
+ _evaluate_polynomial(x2, kBeta, 5);
+}
\ No newline at end of file
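A hedged usage sketch for vsi_nn_erf_impl() above, comparing it against the standard-library erf over the clamped [-4, 4] range; it assumes the translation unit is linked against vsi_nn_math.c, and the printed difference column is purely illustrative.

    #include <cmath>
    #include <cstdio>

    extern "C" float vsi_nn_erf_impl(float x);  /* from vsi_nn_math.c */

    int main() {
        /* Outside roughly [-4, 4] single-precision erf saturates to +/-1,
         * which is why the implementation clamps its input. */
        for (float x = -4.0f; x <= 4.0f; x += 0.5f) {
            float approx = vsi_nn_erf_impl(x);
            float ref = std::erf(x);
            printf("x=%+.2f approx=%+.6f ref=%+.6f diff=%.2e\n",
                   x, approx, ref, std::fabs(approx - ref));
        }
        return 0;
    }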
diff --git a/src/tim/vx/internal/src/vip/virtual_device.cpp b/src/tim/vx/internal/src/vip/virtual_device.cpp
new file mode 100644
index 0000000..4cd7982
--- /dev/null
+++ b/src/tim/vx/internal/src/vip/virtual_device.cpp
@@ -0,0 +1,244 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "vip/virtual_device.h"
+#include "virtual_device_private.h"
+#include "vsi_nn_log.h"
+
+namespace vip {
+
+Device::Device(uint32_t id){
+ id_ = id;
+    graphqueue_ = std::make_unique<GraphQueue>();
+    worker_ = std::make_unique<Worker>();
+ ThreadInit();
+ StatusInit();
+}
+
+Device::~Device(){
+}
+
+uint32_t Device::Id() const{
+ return id_;
+}
+
+void Device::ThreadInit(){
+ for (std::size_t i = 0; i < threads_.size(); ++i){
+ std::thread t(&Device::HandleQueue, this);
+ threads_[i] = std::move(t);
+ }
+}
+
+void Device::StatusInit(){
+ // init thread status after thread id has been generated
+ for (std::size_t i = 0; i < threads_.size(); ++i){
+ VSILOGI("Init thread[%ld] status = %d", threads_[i].get_id(), IDLE);
+ threads_status_[threads_[i].get_id()] = IDLE;
+ }
+}
+
+bool Device::ThreadExit(){
+ WaitThreadIdle();
+ for (std::size_t i = 0; i < threads_.size(); ++i){
+ threads_status_[threads_[i].get_id()] = CANCEL;
+ }
+ for (std::size_t i = 0; i < threads_.size(); ++i){
+ graphqueue_->Submit(NULL, NULL, NULL); // submit fake graph to exit thread
+ }
+ for (std::size_t i = 0; i < threads_.size(); ++i){
+ threads_[i].join();
+ }
+ return true;
+}
+
+bool Device::GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data){
+ bool status = false;
+ status = graphqueue_->Submit(graph, func, data);
+ return status;
+}
+
+bool Device::GraphRemove(const vsi_nn_graph_t* graph){
+ return graphqueue_->Remove(graph);
+}
+
+bool Device::ThreadIdle(){
+ for (std::size_t i = 0; i < threads_.size(); ++i){
+ if (threads_status_[threads_[i].get_id()] != IDLE){
+ return false;
+ }
+ }
+ return true;
+}
+
+void Device::WaitThreadIdle(){
+ if (!ThreadIdle()){
+ VSILOGI("Wait threads idle ...");
+        std::unique_lock<std::mutex> lck(idle_mtx_);
+ idle_cv_.wait(lck);
+ VSILOGI("Threads idle");
+ }
+}
+
+Worker::Worker(){
+}
+
+void Worker::RunGraph(const vsi_nn_graph_t* graph){
+ vsi_nn_RunGraph(graph);
+}
+
+void Worker::Handle(const QueueItem& item){
+ vsi_nn_graph_t* graph = item.graph;
+ func_t func = item.func;
+ data_t data = item.data;
+ if (graph != NULL){
+ VSILOGI("Start running graph%d in thread[%ld] ", item.id , std::this_thread::get_id());
+ RunGraph(graph);
+ VSILOGI("End running graph%d in thread[%ld]", item.id , std::this_thread::get_id());
+ }
+ if (func != NULL){
+ func(data);
+ }
+}
+
+void Device::HandleQueue(){
+ std::thread::id thd_id;
+ thd_id = std::this_thread::get_id();
+ // VSILOGI("Thread[%ld] status = %d", thd_id, threads_status_[thd_id]);
+ while (1) {
+ QueueItem item = graphqueue_->Fetch();
+ if (threads_status_[thd_id] == IDLE) {threads_status_[thd_id] = RUNNING;}
+ worker_->Handle(item);
+ if (threads_status_[thd_id] == RUNNING) {threads_status_[thd_id] = IDLE;}
+ if (threads_status_[thd_id] == CANCEL) {VSILOGI("Thread[%ld] exit", thd_id); break;}
+ if ((graphqueue_->Empty()) && ThreadIdle()) {idle_cv_.notify_one();}
+ }
+}
+
+GraphQueue::GraphQueue(){
+ gcount_ = 0;
+}
+
+void GraphQueue::Show(){
+ queue_mtx_.lock();
+ VSILOGI("Queue element:");
+ for (std::size_t i=0; i < queue_.size(); i++){
+ auto gid = queue_[i].id;
+ VSILOGI("%d", gid);
+ }
+ queue_mtx_.unlock();
+}
+
+void GraphQueue::Notify(){
+ cv_.notify_one();
+}
+
+bool GraphQueue::Submit(vsi_nn_graph_t* graph, func_t func, data_t data){
+ queue_mtx_.lock();
+ QueueItem item;
+ item.graph = graph;
+ item.func = func;
+ item.data = data;
+ item.id = gcount_;
+ queue_.push_back(item);
+ if (graph != NULL){
+ VSILOGI("Submit graph%d", item.id);
+ gcount_++;
+ }
+ queue_mtx_.unlock();
+ Notify();
+ return true;
+}
+
+QueueItem GraphQueue::Fetch(){
+ QueueItem item;
+ if (queue_.empty()){
+        std::unique_lock<std::mutex> lock(queue_mtx_);
+ cv_.wait(lock);
+ }
+ queue_mtx_.lock();
+ if (!queue_.empty()){
+ item = queue_.front();
+ queue_.erase(queue_.begin());
+ }
+ queue_mtx_.unlock();
+ return item;
+}
+
+bool GraphQueue::Remove(const vsi_nn_graph_t* graph){
+ queue_mtx_.lock();
+ std::size_t idx=0;
+ bool exist=false;
+ if (!queue_.empty()){
+ for (std::size_t i=0; i < queue_.size(); i++){
+ if (graph == queue_[i].graph){
+ idx = i;
+ exist = true;
+ }
+ }
+ if (exist){
+ auto gid = queue_[idx].id;
+ queue_.erase(queue_.begin() + idx);
+ VSILOGI("Remove graph%d", gid);
+ }
+ }
+ queue_mtx_.unlock();
+ return true;
+}
+
+bool GraphQueue::Empty() const{
+ return queue_.empty();
+}
+
+IDevice::IDevice(uint32_t id){
+ device_ = new Device(id);
+}
+
+IDevice::~IDevice(){
+ delete device_;
+}
+
+uint32_t IDevice::Id() const{
+ return device_->Id();
+}
+
+bool IDevice::GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data){
+ return device_->GraphSubmit(graph, func, data);
+}
+
+bool IDevice::GraphRemove(const vsi_nn_graph_t* graph){
+ return device_->GraphRemove(graph);
+}
+
+bool IDevice::ThreadExit(){
+ return device_->ThreadExit();
+}
+
+bool IDevice::ThreadIdle(){
+ return device_->ThreadIdle();
+}
+
+void IDevice::WaitThreadIdle(){
+ device_->WaitThreadIdle();
+}
+
+} // namespace vip
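A hedged usage sketch for the IDevice wrapper above: submit an already built and verified graph, wait for the worker threads to drain the queue, then join them. It assumes func_t/data_t are a plain callback pointer and a void* payload, as the Submit()/Handle() code suggests; the callback name and entry point are illustrative.

    #include "vip/virtual_device.h"
    #include "vsi_nn_pub.h"

    /* Hypothetical completion callback; data_t is assumed to be void*. */
    static void on_graph_done(void* data) {
        (void)data;
    }

    void run_async(vsi_nn_graph_t* graph) {
        vip::IDevice device(0);                      /* virtual device id 0 */
        device.GraphSubmit(graph, on_graph_done, nullptr);
        device.WaitThreadIdle();                     /* block until workers drain */
        device.ThreadExit();                         /* join worker threads */
    }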
diff --git a/src/tim/vx/internal/src/vip/virtual_device_private.h b/src/tim/vx/internal/src/vip/virtual_device_private.h
new file mode 100644
index 0000000..fc065aa
--- /dev/null
+++ b/src/tim/vx/internal/src/vip/virtual_device_private.h
@@ -0,0 +1,113 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VIP_VIRTUAL_DEVICE_PRIVATE_H
+#define _VIP_VIRTUAL_DEVICE_PRIVATE_H
+
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <memory>