Update prebuilt and internal for 22Q1 release (#349)

update driver to REL/6.4.10.2
update internal to commit-id: 33cfb75b

Co-authored-by: zhouheng.zheng <zhouheng.zheng@ouotlook.com>
Zhouheng Zheng 2022-04-12 15:18:45 +08:00 committed by GitHub
parent d0af7ae8df
commit 20e27ed550
233 changed files with 11427 additions and 7782 deletions


@ -1 +1 @@
REL/6.4.9
REL/6.4.10.2


@ -3347,6 +3347,36 @@ VX_API_ENTRY vx_status VX_API_CALL vxSwapTensorHandle(vx_tensor tensor, void* ne
VX_API_ENTRY vx_status VX_API_CALL vxCopyTensorPatch(vx_tensor tensor, vx_size number_of_dims, const vx_size * view_start, const vx_size * view_end,
const vx_size * user_stride, void * user_ptr, vx_enum usage, vx_enum user_memory_type);
/*! \brief Allows the application to copy a view patch from/into a tensor object.
* \param [in] tensor The reference to the tensor object that is the source or the
* destination of the copy.
* \param [in] number_of_dims Number of patch dimensions. An error is returned if this is 0 or greater than the number of
* tensor dimensions. If it is smaller than the number of tensor dimensions, the lower dimensions are assumed.
* \param [in] view_start Array of patch start points in each dimension
* \param [in] view_end Array of patch end points in each dimension
* \param [in] tensorpatch_addressing Pointer to parameter of type <tt>\ref vx_tensorpatch_addressing_t</tt>.
* \param [in] user_ptr The address of the memory location where to store the requested data
* if the copy was requested in read mode, or from where to get the data to store into the tensor
* object if the copy was requested in write mode. The accessible memory must be large enough
* to contain the specified patch with the specified layout:\n
* accessible memory in bytes >= (end[last_dimension] - start[last_dimension]) * stride[last_dimension].\n
* The layout of the user memory must follow a row major order.
* \param [in] usage This declares the effect of the copy with regard to the tensor object
* using the <tt>\ref vx_accessor_e</tt> enumeration. Only <tt>\ref VX_READ_ONLY</tt> and <tt>\ref VX_WRITE_ONLY</tt> are supported:
* \arg <tt>\ref VX_READ_ONLY</tt> means that data is copied from the tensor object into the application memory
* \arg <tt>\ref VX_WRITE_ONLY</tt> means that data is copied into the tensor object from the application memory
* \param [in] user_memory_type A <tt>\ref vx_memory_type_e</tt> enumeration that specifies
* the memory type of the memory referenced by user_ptr.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual tensor that cannot be
* accessed by the application.
* \retval VX_ERROR_INVALID_REFERENCE The tensor reference is not actually a tensor reference.
* \retval VX_ERROR_INVALID_PARAMETERS Another parameter is incorrect.
* \ingroup group_object_tensor
*/
VX_API_ENTRY vx_status VX_API_CALL vxCopyTensorPatch2(vx_tensor tensor, vx_size number_of_dims, const vx_size * view_start, const vx_size * view_end,
const vx_tensorpatch_addressing_t * addressing, vx_size size_of_addressing, void * user_ptr, vx_enum usage, vx_enum user_memory_type);
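The new vxCopyTensorPatch2 entry point takes the patch layout through a vx_tensorpatch_addressing_t (defined later in this change) instead of a bare stride array. A minimal sketch of a read-mode copy follows; the tensor handle is a placeholder created elsewhere and error handling is omitted.

static vx_status copy_patch_example(vx_tensor tensor)
{
    vx_size start[2]  = {0, 0};                                  /* patch origin per dimension      */
    vx_size end[2]    = {4, 4};                                  /* patch end (exclusive)           */
    vx_size dims[2]   = {4, 4};                                  /* patch extent seen by user_ptr   */
    vx_size stride[2] = {sizeof(vx_int8), 4 * sizeof(vx_int8)};  /* row-major layout in user memory */
    vx_int8 buffer[16];
    vx_tensorpatch_addressing_t addr;

    addr.num_of_dims   = 2;
    addr.dim_sizes     = dims;
    addr.strides       = stride;
    addr.stride_x_bits = 0;                                      /* only used for sub-byte strides  */

    return vxCopyTensorPatch2(tensor, 2, start, end, &addr, sizeof(addr),
                              buffer, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
}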
/*! \brief Allows the application to get direct access to a patch of tensor object.
* \param [in] tensor The reference to the tensor object that is the source or the
* destination for direct access.


@ -50,7 +50,6 @@ enum vx_library_e {
* \ingroup group_kernel
*/
enum vx_kernel_e {
/*!
* \brief The Color Space conversion kernel.
* \details The conversions are based on the <tt>\ref vx_df_image_e</tt> code in the images.
@ -377,7 +376,7 @@ enum vx_kernel_e {
* \see group_vision_function_min
*/
VX_KERNEL_MIN = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3F,
/*! \brief The weighted average kernel.
* \see group_vision_function_weighted_average
*/
@ -391,14 +390,14 @@ enum vx_kernel_e {
VX_KERNEL_NN_FULLY_CONNECTED_RELU_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2,
//VX_KERNEL_NN_SOFTMAX_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x3,
//VX_KERNEL_NN_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4,
VX_KERNEL_NN_LRN_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x3,
//VX_KERNEL_NN_NORMALIZE_IMAGE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4,
//VX_KERNEL_NN_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x7,
//VX_KERNEL_NN_ACTIVATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x9,
@ -415,7 +414,7 @@ enum vx_kernel_e {
//VX_KERNEL_NN_CONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xF,
VX_KERNEL_NN_CONCATINDEFINITE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x8,
VX_KERNEL_NN_REORG_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x9,
//VX_KERNEL_NN_DECONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x12,
@ -429,9 +428,9 @@ enum vx_kernel_e {
VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xD,
VX_KERNEL_NN_POOLING_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xE,
VX_KERNEL_NN_TENSOR_REDUCE_SUM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xF,
VX_KERNEL_NN_TENSOR_PAD = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x10,
VX_KERNEL_NN_LSTM_UNIT = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x11,
@ -439,25 +438,25 @@ enum vx_kernel_e {
VX_KERNEL_NN_LSTM_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x12,
VX_KERNEL_NN_REORG2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x13,
VX_KERNEL_NN_TENSOR_ROUNDING = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x14,
VX_KERNEL_NN_HASH_LUT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x15,
VX_KERNEL_NN_LSH_PROJECTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x16,
VX_KERNEL_NN_TENSOR_RESHPE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x17,
VX_KERNEL_NN_LUT2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x18,
VX_KERNEL_NN_TENSOR_SCALE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x19,
VX_KERNEL_NN_RNN_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1A,
VX_KERNEL_NN_SOFTMAX2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1B,
VX_KERNEL_NN_SVDF_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1C,
VX_KERNEL_NN_NORMALIZATION_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1D,
VX_KERNEL_NN_TENSOR_REVERSE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1E,
@ -477,11 +476,11 @@ enum vx_kernel_e {
VX_KERNEL_NN_PRELU = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x25,
VX_KERNEL_NN_GRU_UNIT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x26,
VX_KERNEL_NN_GRU_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x27,
VX_KERNEL_NN_CONV_LSTM_UNIT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x28,
VX_KERNEL_NN_CONV_LSTM_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x29,
VX_KERNEL_NN_FULLY_CONNECTED_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2A,
@ -498,6 +497,8 @@ enum vx_kernel_e {
VX_KERNEL_NN_CONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x30,
VX_KERNEL_NN_DECONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x31,
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};


@ -29,8 +29,8 @@
#define __VX_KHR_COMPATIBLE_H__
/*
VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS is used to distinguish the deconvolution weight layout
[value]
0: weight_layout is whnc
1: weight_layout is whcn
*/
#ifndef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
@ -166,4 +166,34 @@ VX_CONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support con
#define VX_CONV_3D_API_SUPPORT 1
#endif
/*
VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support deconv3d by vxDeconv3dLayer API.
[value]
0: not support
1: support
*/
#ifndef VX_DECONV_3D_API_SUPPORT
#define VX_DECONV_3D_API_SUPPORT 0
#endif
/*
VX_PAD_CONST_SUPPORT is used to declare that openvx can support pad_const for tensorpad and convolution.
[value]
0: not support
1: support
*/
#ifndef VX_PAD_CONST_SUPPORT
#define VX_PAD_CONST_SUPPORT 1
#endif
/*
VX_TENSOR_STRIDE_X_BITS_SUPPORT is used to declare that openvx can support tensors whose stride in the x dimension is specified in bits rather than an integer number of bytes.
[value]
0: not support
1: support
*/
#ifndef VX_TENSOR_STRIDE_X_BITS_SUPPORT
#define VX_TENSOR_STRIDE_X_BITS_SUPPORT 1
#endif
#endif /* __VX_KHR_COMPATIBLE_H__ */
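These compatibility macros let application code adapt at compile time to what the prebuilt driver exposes. A small, illustrative sketch follows; only the macro names come from this header, the include path is an assumption.

#include "VX/vx_khr_compatible.h"   /* path assumed; adjust to the prebuilt include layout */

#if VX_TENSOR_STRIDE_X_BITS_SUPPORT
/* 4-bit tensors may describe their packed layout via stride_x_bits. */
#endif

#if !VX_DECONV_3D_API_SUPPORT
/* vxDeconv3dLayer is not available in this build; keep a fallback path. */
#endif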


@ -1,4 +1,4 @@
/*
* Copyright (c) 2012-2017 The Khronos Group Inc.
*
@ -74,7 +74,7 @@ CONVOLUTIONAL_NETWORK structs and enums
/*! \brief The Neural Network Extension Library Set
* \ingroup group_cnn
*/
#define VX_LIBRARY_KHR_NN_EXTENSION (0x1)
/*! \brief The list of Neural Network Extension Kernels.
* \ingroup group_cnn
@ -212,7 +212,7 @@ enum vx_nn_activation_function_e
VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
};
/*! \brief The Convolutional network type
* \ingroup group_cnn
*/
enum vx_nn_layer_type_e
@ -337,6 +337,30 @@ typedef struct _vx_nn_convolution_3d_params_t
vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value, if 0, means convolution, elsewise(>=1), the convolution is depthwiseconvolution. */
}vx_nn_convolution_3d_params_t;
typedef struct _vx_nn_deconvolution_3d_params_t
{
vx_int32 padding_w_left; /*!< \brief Number of elements subtracted at left of the w dimension of the input. */
vx_int32 padding_w_right; /*!< \brief Number of elements subtracted at right of the w dimension of the input. */
vx_int32 padding_h_top; /*!< \brief Number of elements subtracted at top of the h dimension of the input. */
vx_int32 padding_h_bottom; /*!< \brief Number of elements subtracted at bottom of the h dimension of the input. */
vx_int32 padding_d_front; /*!< \brief Number of elements subtracted at front of the d dimension of the input. */
vx_int32 padding_d_rear; /*!< \brief Number of elements subtracted at end of the d dimension of the input. */
vx_int32 stride_w; /*!< \brief Number of zeros inserted between input elements in the w direction. */
vx_int32 stride_h; /*!< \brief Number of zeros inserted between input elements in the h direction. */
vx_int32 stride_d; /*!< \brief Number of zeros inserted between input elements in the d direction. */
vx_int32 a_w; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_w\f$ different possible output sizes. */
vx_int32 a_h; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_h\f$ different possible output sizes. */
vx_int32 a_d; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_d\f$ different possible output sizes. */
vx_int32 channel_group; /*!< \brief Number of separate groups for deconvolution (Range: 0 <= groups <= size of z dimension of input; size of z dimension of input can be divided by groups) */
vx_enum overflow_policy; /*!< \brief A <tt> VX_TYPE_ENUM</tt> of the <tt> vx_convert_policy_e</tt> enumeration. */
vx_enum rounding_policy; /*!< \brief A <tt> VX_TYPE_ENUM</tt> of the <tt> vx_round_policy_e</tt> enumeration. */
vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See <tt>\ref vx_nn_rounding_type_e</tt> */
}vx_nn_deconvolution_3d_params_t;
/*==============================================================================
TENSOR DATA FUNCTIONS
=============================================================================*/
@ -415,9 +439,9 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorAddressing(vx_tensor_addressin
/*! \brief Creates an array of tensors
* \param [in] context The reference to the overall Context.
* \param [in] count Number of Objects to create in the ObjectArray.
* \param [in] tensor* The array of tensors to add to the ObjectArray.
*
* \returns An ObjectArray reference <tt>\ref vx_object_array</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>. Data objects are not initialized by this function.
*
* \ingroup group_object_array
@ -426,18 +450,18 @@ VX_API_ENTRY vx_object_array VX_API_CALL vxCreateTensorObjectArray(vx_context co
typedef union _vx_tensor_quant_param
{
struct
{
vx_int8 fixed_point_pos; /*!< \brief Specifies the fixed point position when the input element type is int16/int8, if 0 calculations are performed in integer math */
} dfp;
struct
{
vx_float32 scale; /*!< \brief Scale value for the quantized value */
vx_int32 zeroPoint; /*!< \brief A 32 bit integer, in range [0, 255] */
} affine;
struct
{
vx_uint32 channelDim; /*!< \brief a 32 bit unsigned integer indicating channel dimension */
vx_uint32 scaleCount; /*!< \brief the size of the scale array, must be equal to size[channelDim] */
@ -515,22 +539,22 @@ VX_API_ENTRY vx_status VX_API_CALL vxSwapTensor(vx_tensor tensor0, vx_tensor ten
* \param [in] context The reference to the implementation context.
* \param [in] tensor_create_params The <tt>\ref vx_tensor_create_params_t</tt> that points to a parameter structure.
* \param [in] size_of_create_params Size of parameter structure.
* \param [in] addrs The tensor patch addressing structures that define the dimension and stride of pointers. See note below.
* \param [in] ptr The logical pointer of platform-defined references to tensor data.
* \param [in] import_type <tt>\ref vx_memory_type_e</tt>. When giving <tt>\ref VX_MEMORY_TYPE_HOST</tt>
* the \a ptr is assumed to be a HOST accessible pointer to memory.
* \returns A tensor reference <tt>\ref vx_tensor</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*
* In order to release the tensor back to the application, use <tt>\ref vxSwapTensorHandle</tt>.
*
* \ingroup group_tensor
*\version 0.4
*/
VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2(
vx_context context, const vx_tensor_create_params_t* tensor_create_params, vx_size size_of_create_params, const vx_tensor_addressing addrs,
void * const ptr, vx_enum import_type);
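A hedged sketch of wrapping an existing host buffer with vxCreateTensorFromHandle2. The vx_tensor_create_params_t field names (num_of_dims, sizes, data_format and the quantization union) are recalled from the vendor header rather than shown in this diff, and passing NULL addressing to get a packed layout is likewise an assumption.

static vx_tensor wrap_host_buffer(vx_context ctx, vx_uint8 *data /* 16x16 buffer owned by the app */)
{
    vx_uint32 sizes[2] = {16, 16};
    vx_tensor_create_params_t params = {0};    /* field names assumed; quantization left zeroed */

    params.num_of_dims = 2;
    params.sizes       = sizes;
    params.data_format = VX_TYPE_UINT8;

    /* NULL addressing is assumed here to request the default packed layout. */
    return vxCreateTensorFromHandle2(ctx, &params, sizeof(params), NULL,
                                     data, VX_MEMORY_TYPE_HOST);
}

The buffer stays owned by the application; vxSwapTensorHandle() later hands it back, as the documentation above notes.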
/*! \brief Flush the memory referenced by reference's handle when it is ready.
* \param [in] ref The reference (image or tensor) that was created from a handle.
* \return A <tt>\ref vx_status_e</tt> enumeration.
@ -607,7 +631,7 @@ typedef struct _vx_nn_convolution_params_t
typedef struct _vx_nn_convolution_params_ext_t
{
vx_nn_convolution_params_t khr; /*!< \brief Khronos standard structure head */
vx_size padding_x_right; /*!< \brief Number of elements added at the right side of the x dimension of the input,
"padding_x" is for the left */
vx_size padding_y_bottom; /*!< \brief Number of elements added at each side in the bottom of y dimension of the input.
"padding_y" is for the top */
@ -696,7 +720,7 @@ typedef struct _vx_nn_convolution_params_ext2_t
* The relation between input to output is as follows: \n
* \f$ width_{output} = round(\frac{(width_{input} + 2 * padding_x - kernel_x - (kernel_x -1) * dilation_x)}{skip_x} + 1) \f$\n
* and \n
* \f$ height_{output} = round(\frac{(height + 2 * padding_y - kernel_y - (kernel_y -1) * dilation_y)}{skip_y} + 1) \f$\n
* where \f$width\f$ is the size of the input width dimension. \f$height\f$ is the size of the input height dimension.
* \f$width_{output}\f$ is the size of the output width dimension. \f$height_{output}\f$ is the size of the output height dimension.
* \f$kernel_x\f$ and \f$kernel_y\f$ are the convolution sizes in width and height dimensions.
@ -705,11 +729,11 @@ typedef struct _vx_nn_convolution_params_ext2_t
* Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here.
* \param [in] graph The handle to the graph.
* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested.
* The dimension order is [width, height, #IFM, #batches].\n
* \param [in] weights [*static] Weights are 4d tensor with dimensions [kernel_x, kernel_y, #IFM, #OFM]. see <tt>\ref vxCreateTensor2</tt> and <tt>\ref vxCreateVirtualTensor2</tt> \n Weights data type must match the data type of the inputs. (Kernel parameter #1)
* \param [in] biases [*static] Optional, ignored if NULL. The biases, which may be shared (one per ofm) or unshared (one per ofm * output location). The possible layouts are
* either [#OFM] or [width, height, #OFM]. Biases data type must match the data type of the inputs.
* \param [in] convolution_params [static] Pointer to parameters of type <tt>\ref vx_nn_convolution_params_t</tt>.
* \param [in] size_of_convolution_params [static] Size in bytes of convolution_params. Note that this parameter is not counted as one of the kernel parameters.
* \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. Output tensor data type must be same as the inputs.
* \return <tt> vx_node</tt>.
@ -725,8 +749,8 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvolutionLayer(vx_graph graph, vx_tensor in
* round: rounding according the <tt>vx_round_policy_e</tt> enumeration. \n
* saturate: A saturation according the <tt>vx_convert_policy_e</tt> enumeration.
* The saturation is done based on the accumulator_bits parameter.
* According to the accumulator_bits, the saturation might not be performed on every operation,
* but only after a specified number of operations
* that are suspected to saturate the accumulation bits.\n
* The equation for Fully connected layer:\n
* \f$ outputs[i] = ( \sum_{j} saturate(round(inputs[j] \times weights[j,i])))+biasses[i] \f$\n
@ -735,10 +759,10 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvolutionLayer(vx_graph graph, vx_tensor in
* Then down scale is done by picking the results according to a skip jump. The skip is determined by the output size dimensions.
* The relation between input to output is as follows:
* \f$ size_{output} = round(\frac{(size_{input} + 2 * pad)}{skip} + 1) \f$\n
* where \f$size_{input}\f$ is the size of the input dimension.
* \f$size_{output}\f$ is the size of the output dimension.
* skip is calculated by the relation between input and output.
* rounding is done according to <tt>\ref vx_convolutional_network_rounding_type_e</tt>.
* \param [in] graph The handle to the graph.
* \param [in] inputs The input tensor data. There two possible input layouts:
* 1. [#IFM, #batches]. See <tt>\ref vxCreateTensor2</tt> and <tt>\ref vxCreateVirtualTensor2</tt>.
@ -884,7 +908,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxNormalizationLayer2(vx_graph graph, vx_tensor
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*/
VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inputs, vx_enum function, vx_float32 a,vx_float32 b, vx_tensor outputs);
/*! \brief [Graph] Creates a Convolutional Network ROI pooling node
* \details Pooling is done on the width and height dimensions of the <tt>\ref vx_tensor</tt>. The ROI Pooling get an array of roi rectangles, and an input tensor.
@ -892,9 +916,9 @@ VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inp
* The down scale method is determined by the pool_type.
* Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here.
* \param [in] graph The handle to the graph.
* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional. Dimension layout is [width, height, #IFM, #batches].
* See <tt>\ref vxCreateTensor2</tt> and <tt>\ref vxCreateVirtualTensor2</tt>.
* Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8' or 'KHR_NN_8 KHR_NN_16'. (Kernel parameter #0)
* \param [in] inputs_rois The roi array tensor. ROI array with dimensions [4, roi_count, #batches] where the first dimension represents 4 coordinates of the top left and bottom right corners of the roi rectangles, based on the input tensor width and height.
* #batches is optional and must be the same as in inputs. roi_count is the number of ROI rectangles. (Kernel parameter #1)
* \param [in] pool_type [static] Of type <tt>\ref vx_nn_pooling_type_e</tt>. Only <tt>\ref VX_NN_POOLING_MAX</tt> pooling is supported. (Kernel parameter #2)
@ -906,13 +930,13 @@ VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inp
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*/
VX_API_ENTRY vx_node VX_API_CALL vxROIPoolingLayer(vx_graph graph, vx_tensor input_data, vx_tensor input_rois, const vx_nn_roi_pool_params_t *roi_pool_params, vx_size size_of_roi_params, vx_tensor output_arr);
/*! \brief [Graph] Creates a Convolutional Network Deconvolution Layer Node.
* \details Deconvolution denotes a sort of reverse convolution, which importantly and confusingly is not actually a proper mathematical deconvolution.
* Convolutional Network Deconvolution is up-sampling of an image by learned Deconvolution coefficients.
* The operation is similar to convolution but can be implemented by up-sampling the inputs with zeros insertions between the inputs,
* and convolving the Deconvolution kernels on the up-sampled result.
* For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined,
* and should be at least 16.\n
* round: rounding according the <tt>vx_round_policy_e</tt> enumeration. \n
@ -926,7 +950,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxROIPoolingLayer(vx_graph graph, vx_tensor inp
* The relation between input to output is as follows: \n
* \f$ width_{output} = (width_{input} -1) * upscale_x - 2 * padding_x + kernel_x + a_x \f$\n
* and \n
* \f$ height_{output} = (height_{input} - 1) * upscale_y - 2 * padding_y + kernel_y + a_y \f$\n
* where \f$width_{input}\f$ is the size of the input width dimension. \f$height_{input}\f$ is the size of the input height dimension.
* \f$width_{output}\f$ is the size of the output width dimension. \f$height_{output}\f$ is the size of the output height dimension.
* \f$kernel_x\f$ and \f$kernel_y\f$ are the convolution sizes in width and height. \f$a_x\f$ and \f$a_y\f$ are user-specified quantity used to distinguish between the \f$upscale_x\f$ and \f$upscale_y\f$ different possible output sizes.
@ -966,7 +990,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxDeconvolutionLayer(vx_graph graph, vx_tensor
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxLeakyReluLayer(
vx_graph graph,
vx_tensor inputs,
vx_float32 negative_slope,
vx_tensor outputs
@ -985,7 +1009,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLeakyReluLayer(
* \version 0.5
*/
VX_API_ENTRY vx_node VX_API_CALL vxPReluLayer(
vx_graph graph,
vx_tensor inputs,
vx_tensor alpha,
vx_tensor outputs
@ -1033,14 +1057,14 @@ VX_API_ENTRY vx_node VX_API_CALL vxConcat2Layer(
vx_tensor in0,
vx_tensor in1,
vx_tensor out
);
/*! \brief parameter for vxConcatIndefiniteLayer
* \ingroup group_cnn
* \version 0.4
*/
typedef struct _vx_nn_concat_params_t
{
vx_uint32 axis; /*!< \brief The axis on which we need do concat. */
} vx_nn_concat_params_t;
@ -1085,7 +1109,7 @@ enum vx_reorg_type_e
VX_REORG_SHUFFLE_CHANNEL,
};
/*! \brief Input parameter for reorg layer
*\ingroup group_cnn
*\version 0.4
*/
@ -1108,7 +1132,7 @@ typedef struct _vx_nn_reorg_params_ext_t
typedef struct _vx_nn_reorg_params_ext2_t
{
vx_nn_reorg_params_t base; /*!< \brief vx_nn_reorg_params <tt>\ref vx_nn_reorg_params_t</tt> */
vx_int32 *num_group;
vx_int32 *axis;
} vx_nn_reorg_params_ext2_t;
@ -1125,7 +1149,7 @@ typedef struct _vx_nn_reorg_params_ext2_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxReorgLayer2(
vx_graph graph,
vx_tensor input,
const vx_nn_reorg_params reorg_params,
vx_size size_of_reorg_params,
@ -1154,7 +1178,7 @@ typedef struct _vx_nn_rounding_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorRoundingNode(
vx_graph graph,
vx_tensor input,
const vx_nn_rounding_params rounding_params,
vx_size size_of_rounding_params,
@ -1189,7 +1213,7 @@ typedef struct _vx_nn_hashlut_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxHashTableLookupLayer(
vx_graph graph,
vx_tensor input,
const vx_nn_hashlut_params hashlut_params,
vx_size size_of_hashlut_params,
@ -1235,7 +1259,7 @@ typedef struct _vx_nn_lshproj_params_t
* \param [in] lshproj_params Pointer to parameters of type <tt>\ref vx_nn_lshproj_params</tt>
* \param [in] size_of_lshproj_params [static] Size in bytes of vx_nn_lshproj_params.
* \param [out] output The output tensor data.
* If the projection type is sparse:
* Output.Dim == { Tensor[0].Dim[0] }
* A tensor that represents hash signatures.
* If the projection type is Dense:
@ -1248,7 +1272,7 @@ typedef struct _vx_nn_lshproj_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxLSHProjectionLayer(
vx_graph graph,
vx_tensor input,
const vx_nn_lshproj_params lshproj_params,
vx_size size_of_lshproj_params,
@ -1261,7 +1285,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLSHProjectionLayer(
*/
typedef struct _vx_nn_reshape_params_t
{
vx_tensor dims; /*!< \brief dimension. */
} vx_nn_reshape_params_t, * vx_nn_reshape_params;
/*! \brief [Graph] Creates a Reshape Layer Node.
@ -1277,7 +1301,7 @@ typedef struct _vx_nn_reshape_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorReshapeNode(
vx_graph graph,
vx_tensor input,
const vx_nn_reshape_params reshape_params,
vx_size size_of_reshape_params,
@ -1306,7 +1330,7 @@ typedef struct _vx_nn_scale_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorScaleNode(
vx_graph graph,
vx_tensor input,
const vx_nn_scale_params scale_params,
vx_size size_of_scale_params,
@ -1370,7 +1394,7 @@ typedef struct _vx_nn_rnn_params_t
* \details A basic recurrent neural network layer.
* This layer implements the operation:
* outputs = state = activation(inputs * input_weights + state * recurrent_weights + bias)
*
* Where:
* "input_weights" is a weight matrix that multiplies the inputs;
* "recurrent_weights" is a weight matrix that multiplies the current
@ -1392,7 +1416,7 @@ typedef struct _vx_nn_rnn_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxRNNLayer(
vx_graph graph,
vx_tensor input,
const vx_nn_rnn_params rnn_params,
vx_size size_of_rnn_params,
@ -1432,7 +1456,7 @@ typedef struct _vx_nn_softmax_params_ext_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxSoftmaxLayer2(
vx_graph graph,
vx_tensor input,
const vx_nn_softmax_params softmax_params,
vx_size size_of_softmax_params,
@ -1458,25 +1482,25 @@ typedef struct _vx_nn_svdf_params_t
* densely connected layer that's processing a sequence of input frames can
* be approximated by using a singular value decomposition of each of its
* nodes. The implementation is based on:
*
* https://research.google.com/pubs/archive/43813.pdf
*
* P. Nakkiran, R. Alvarez, R. Prabhavalkar, C. Parada.
* "Compressing Deep Neural Networks using a Rank-Constrained Topology".
* INTERSPEECH, 2015.
*
* It processes the incoming input using a 2-stage filtering mechanism:
* stage 1 performs filtering on the "features" dimension, whose outputs get
* pushed into a memory of fixed-size memory_size.
* stage 2 performs filtering on the "time" dimension of the memory_size
* memoized outputs of stage 1.
*
* Specifically, for rank 1, this layer implements the operation:
*
* memory = push(conv1d(inputs, weights_feature, feature_dim,
* "PADDING_VALID"));
* outputs = activation(memory * weights_time + bias);
*
* Where:
* "weights_feature" is a weights matrix that processes the inputs (by
* convolving the input with every "feature filter"), and whose outputs get
@ -1488,7 +1512,7 @@ typedef struct _vx_nn_svdf_params_t
* batch); and
* "activation" is the function passed as the "fused_activation_function"
* argument (if not "NONE").
*
* Each rank adds a dimension to the weights matrices by means of stacking
* the filters.
* \param [in] graph The reference to the parent graph.
@ -1506,7 +1530,7 @@ typedef struct _vx_nn_svdf_params_t
* \version 0.4
*/
VX_API_ENTRY vx_node VX_API_CALL vxSVDFLayer(
vx_graph graph,
vx_tensor input,
const vx_nn_svdf_params svdf_params,
vx_size size_of_svdf_params,
@ -1535,7 +1559,7 @@ typedef struct _vx_nn_pooling_params_t
* \version 0.4
*/
typedef struct _vx_nn_pooling_params_ext_t
{
vx_nn_pooling_params_t base; /*!< \brief The base definition.<tt>\ref vx_nn_pooling_params_t</tt> */
vx_uint32 stride_x; /*!< \brief Skip x jump for down scale. */
vx_uint32 stride_y; /*!< \brief Skip y jump for down scale. */
@ -1569,7 +1593,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxPoolingLayer2(
/*! \brief [Graph] Performs arithmetic addition on element values in the input tensor data's.
* \param [in] graph The handle to the graph.
* \param [in] in1 input tensor data.
* \param [in] in2 input tensor data; inputs must be equal in dimensions.
* else, If in one of the vx_mddata dimension is 1.
* That dimension is considered as a const on all the dimension terms.
@ -1639,7 +1663,6 @@ typedef struct _vx_nn_pad_params_t
vx_uint8 numViewDimensions; /*!< \brief The size of two arrays. */
vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the <tt>\ref vx_pad_mode_e</tt> enumeration. */
vx_scalar pad_const; /*!< \brief The order const value if setting pad mode to const, the const value is base value, not quantized value. */
} vx_nn_pad_params_t, * vx_nn_pad_params;
@ -1716,9 +1739,9 @@ typedef struct _vx_nn_l2norm_params_t
* \retval * Node handle.
*/
VX_API_ENTRY vx_node VX_API_CALL vxL2NormalizeLayer2(
vx_graph graph,
vx_tensor inputs,
const vx_nn_l2norm_params_t * l2norm_params,
vx_size size_of_l2norm_params,
vx_tensor outputs);
@ -1752,7 +1775,7 @@ typedef struct _vx_nn_rpn_params_t
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxRPNLayer(
vx_graph graph,
vx_tensor score,
vx_tensor bbox,
vx_tensor anchors,
@ -1773,24 +1796,24 @@ typedef struct _vx_nn_lstm_params_t
vx_tensor input2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/
vx_tensor input2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/
vx_tensor input2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/
vx_tensor recurrent2input_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [num_units, output_size]. where "output_size" corresponds to either the number of cell units (i.e., "num_units"), or the second dimension of the "projection_weights", if defined.*/
vx_tensor recurrent2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/
vx_tensor recurrent2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/
vx_tensor recurrent2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/
vx_tensor cell2input_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
vx_tensor cell2forget_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
vx_tensor cell2output_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
vx_tensor input_gate_bias; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/
vx_tensor forget_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/
vx_tensor cell_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/
vx_tensor output_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/
vx_tensor projection_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [output_size, num_units].*/
vx_tensor projection_bias; /*!< \brief Optional A 1-D tensor of type T, of shape [output_size].*/
vx_tensor activation; /*!< \brief Optional. An ActivationFunctionType indicating the activation function. If "NONE" is specified then it results in a linear activation.*/
vx_tensor cell_clip; /*!< \brief A clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip]. If set to 0.0 then clipping is disabled.*/
vx_tensor proj_clip; /*!< \brief A clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.*/
@ -1805,9 +1828,9 @@ typedef struct _vx_nn_lstm_params_ext_t
vx_tensor forget_bias; /*!< \brief A bias(float 32) for the forget gate. If set to 0.0f(by default) then bias is ignored.*/
vx_float32 norm_gain; /*!< \brief Float32[static] The layer normalization gain initial value(default is 1.0f).*/
vx_float32 norm_shift; /*!< \brief Float32[static] The layer normalization shift initial value(default is 0.0f).*/
vx_tensor sequence_length; /*!< \brief Optional[static] Specifies the length of each sequence in inputs. An `int32` (tensor) size `[batch_size]`, values in `[0, time_len)` or None(by default).*/
/*Since ANDROID NN API level 29 there are additional inputs to this op:*/
vx_tensor layernorm2input_weight; /*!< \brief [Optional] The input layer normalization weights. A 1 - D tensor of shape[num_units].Used to rescale normalized inputs to activation at input gate.*/
@ -1846,11 +1869,11 @@ typedef struct _vx_nn_lstm_layer_params_ext_t
* Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory
* recurrent neural network architectures for large scale acoustic modeling."
* INTERSPEECH, 2014.
*
* The coupling of input and forget gate (CIFG) is based on:
* http://arxiv.org/pdf/1503.04069.pdf
* Greff et al. "LSTM: A Search Space Odyssey"
*
* The class has the following independently optional inputs:
* * If input gate (if CIFG): "input_to_forget_weights",
* "recurrent_to_input_weights", "cell_to_input_weights", "input_gate_bias".
@ -1870,7 +1893,7 @@ typedef struct _vx_nn_lstm_layer_params_ext_t
* \param [out] scratch A 3-D tensor of type T, of shape [num_cell, 4, batch_size].
* \param [out] output_state_out A 2-D tensor of type T, of shape [output_size, batch_size].
* \param [out] cell_state_out A 2-D tensor of type T, of shape [num_units, batch_size].
* \param [out] output A 2-D tensor of type T, of shape [output_size, batch_size].
* This is effectively the same as the current "output_state" value.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
@ -1905,7 +1928,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLstmUnitLayer(
* is the batching dimension.
* \param [in] lstm_layer_params LSTM paraments <tt>\ref vx_nn_lstm_layer_params_t </tt>.
* \param [in] size_of_lstm_layer_params [static] The size of the lstm_layer_params.
* \param [out] output A 2-D/3D tensor of type T, of shape [output_size, batch_size] or [output_size, batch_size, time].
* This is effectively the same as the current "output_state" value.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
@ -1914,7 +1937,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxLstmUnitLayer(
* \version 0.3
*/
VX_API_ENTRY vx_node VX_API_CALL vxLstmLayer(
vx_graph graph,
vx_tensor input,
vx_tensor static_input,
vx_tensor cont,
@ -1975,7 +1998,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorMeanNode(
* \param [in] input A n-D tensor, specifying the tensor to be squeezed.
* \param [in] squeeze_params paraments <tt>\ref vx_nn_squeeze_params_t </tt>.
* \param [in] size_of_squeeze_param [static] The size of the vx_nn_squeeze_params_t.
* \param [out] output A n-D tensor of the same type as input. Contains the same data as input,
* but has one or more dimensions of size 1 removed.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
@ -2072,6 +2095,65 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryHardwareCaps(
*/
VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_convolution_3d_params_t *convolution_params, vx_size size_of_convolution_params, vx_tensor outputs);
/*! \brief [Graph] Creates a Convolutional Network Deconvolution3d Layer Node.
* \details Deconvolution denotes a sort of reverse convolution, which importantly and confusingly is not actually a proper mathematical deconvolution.
* Convolutional Network Deconvolution is up-sampling of an image by learned Deconvolution coefficients.
* The operation is similar to convolution but can be implemented by up-sampling the inputs with zeros insertions between the inputs,
* and convolving the Deconvolution kernels on the up-sampled result.
* For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined,
* and should be at least 16.\n
* round: rounding according the <tt>vx_round_policy_e</tt> enumeration. \n
* saturate: A saturation according the <tt>vx_convert_policy_e</tt> enumeration.
* The following equation is implemented: \n
* \f$ outputs[j,k,i] = saturate(round(\sum_{l} \sum_{m,n}(inputs_{upscaled}[j+m,k+n,l] \times weights[m,n,l,i])+biasses[j,k,i])) \f$\n
* Where \f$m,n\f$ are indexes on the convolution matrices. \f$ l\f$ is an index on all the convolutions per input.\f$ i\f$ is an index per output.
* \f$ j,k \f$ are the inputs/outputs spatial indexes.
* Deconvolution is done on the width and height dimensions of the <tt>\ref vx_tensor</tt>. Therefore, we use here the term x for the width dimension and y for the height dimension.\n
* before the Deconvolution is done, up-scaling the width and height dimensions with zeros is performed.
* The relation between input to output is as follows: \n
* \f$ width_{output} = (width_{input} -1) * upscale_x - 2 * padding_x + kernel_x + a_x \f$\n
* and \n
* \f$ height_{output} = (height_{input} - 1) * upscale_y - 2 * padding_y + kernel_y + a_y \f$\n
* \f$ depth_{output} = (depth_{input} - 1) * upscale_d - 2 * padding_d + kernel_d + a_d \f$\n
* where
* \f$width_{input}\f$ is the size of the input width dimension.
* \f$height_{input}\f$ is the size of the input height dimension.
* \f$depth_{input}\f$ is the size of the input depth dimension.
*
* \f$width_{output}\f$ is the size of the output width dimension.
* \f$height_{output}\f$ is the size of the output height dimension.
* \f$depth_{output}\f$ is the size of the output depth dimension.
*
* \f$kernel_x\f$, \f$kernel_y\f$ and \f$kernel_d\f$ are the deconvolution kernel sizes in width, height and depth.
* \f$a_x\f$ and \f$a_y\f$ are user-specified quantity used to distinguish between the \f$upscale_x\f$ and \f$upscale_y\f$ different possible output sizes.
* \f$upscale_x\f$, \f$upscale_y\f$ and \f$upscale_d\f$ are calculated by the relation between input and output.
* \f$a_x\f$ and \f$a_y\f$ must be positive and smaller than \f$upscale_x\f$ and \f$upscale_y\f$ respectively.
* Since the padding parameter is applied to the output, the effective input padding is: \n
* \f$ padding_{input_x} = kernel_x -padding_x -1\f$ \n
* \f$ padding_{input_y} = kernel_y -padding_y -1\f$ \n
* \f$ padding_{input_d} = kernel_d -padding_d -1\f$ \n
* Therefore the following constraints apply:
* \f$kernel_x >= padding_x - 1\f$,
* \f$kernel_y >= padding_y - 1\f$.
* \f$kernel_d >= padding_d - 1\f$.
* rounding is done according to <tt>\ref vx_nn_rounding_type_e</tt>.
* Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here.
* \param [in] graph The handle to the graph.
* \param [in] inputs The input tensor. 4 lower dimensions represent a single input, and an optional 5th dimension for batch of inputs. Dimension layout is [width, height, depth, #IFM, #batches].
* See <tt>\ref vxCreateTensor2</tt> and <tt>\ref vxCreateVirtualTensor2</tt>.
* Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8' or 'KHR_NN_8 KHR_NN_16'. (Kernel parameter #0)
* \param [in] weights [static] The 5d weights with dimensions [width, height, depth, #IFM, #OFM]. See <tt>\ref vxCreateTensor2</tt> and <tt>\ref vxCreateVirtualTensor2</tt>. (Kernel parameter #1)
* \param [in] biases [static] Optional, ignored if NULL. The biases have one dimension [#OFM]. Implementations must support input tensor data type same as the inputs. (Kernel parameter #2)
* \param [in] deconvolution_params [static] Pointer to parameters of type <tt>\ref vx_nn_deconvolution_params_t</tt> (Kernel parameter #3)
* \param [in] size_of_deconv_params [static] Size in bytes of deconvolution_params. Note that this parameter is not counted as one of the kernel parameters.
* \param [out] outputs The output tensor. The output has the same number of dimensions as the input. (Kernel parameter #4)
* \ingroup group_cnn
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*/
VX_API_ENTRY vx_node VX_API_CALL vxDeconv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_deconvolution_3d_params_t *convolution_params, vx_size size_of_deconv_params, vx_tensor outputs);
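A minimal sketch of driving the new node: fill the vx_nn_deconvolution_3d_params_t defined above and pass it to vxDeconv3dLayer. The graph and tensor handles are placeholders assumed to follow the layouts described in the documentation; the padding and stride values are arbitrary.

static vx_node add_deconv3d(vx_graph graph, vx_tensor input, vx_tensor weights,
                            vx_tensor biases, vx_tensor output)
{
    vx_nn_deconvolution_3d_params_t p;

    p.padding_w_left  = 1;  p.padding_w_right  = 1;
    p.padding_h_top   = 1;  p.padding_h_bottom = 1;
    p.padding_d_front = 1;  p.padding_d_rear   = 1;
    p.stride_w = 2;  p.stride_h = 2;  p.stride_d = 2;   /* zeros inserted between inputs */
    p.a_w = 0;  p.a_h = 0;  p.a_d = 0;
    p.channel_group            = 1;
    p.overflow_policy          = VX_CONVERT_POLICY_SATURATE;
    p.rounding_policy          = VX_ROUND_POLICY_TO_ZERO;
    p.down_scale_size_rounding = VX_NN_DS_SIZE_ROUNDING_FLOOR;

    return vxDeconv3dLayer(graph, input, weights, biases, &p, sizeof(p), output);
}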
#ifdef __cplusplus
}
#endif


@ -304,6 +304,39 @@ typedef struct _vx_tensor_view_t * vx_tensor_view;
*/
typedef struct _vx_tensor_addressing_t * vx_tensor_addressing;
/*!
* \brief The addressing image patch structure is used by the Host only
* to address pixels in an image patch. The fields of the structure are defined as:
* \arg dim - The dimensions of the image in logical pixel units in the x & y direction.
* \arg stride - The physical byte distance from a logical pixel to the next
* logically adjacent pixel in the positive x or y direction.
* \arg scale - The relationship of scaling from the primary plane (typically
* the zero indexed plane) to this plane. An integer down-scaling factor of \f$ f \f$ shall be
* set to a value equal to \f$ scale = \frac{unity}{f} \f$ and an integer up-scaling factor of \f$ f \f$
* shall be set to a value of \f$ scale = unity * f \f$. \f$ unity \f$ is defined as <tt>\ref VX_SCALE_UNITY</tt>.
* \arg step - The step is the number of logical pixel units to skip to
* arrive at the next physically unique pixel. For example, on a plane that is
* half-scaled in a dimension, the step in that dimension is 2 to indicate that
* every other pixel in that dimension is an alias. This is useful in situations
* where iteration over unique pixels is required, such as in serializing
* or de-serializing the image patch information.
* \see <tt>\ref vxMapImagePatch</tt>
* \ingroup group_image
*/
typedef struct _vx_tensorpatch_addressing_t {
vx_uint32 num_of_dims; /*!< \brief Number of dimensions in the patch. */
vx_size *dim_sizes; /*!< \brief Pointer to dimensions array */
vx_size *strides; /*!< \brief Pointer to strides array */
vx_uint16 stride_x_bits; /*!< \brief Stride in X dimension in bits. Used when stride_x is not an integer number of bytes. */
} vx_tensorpatch_addressing_t;
/*! \brief The addressing of a tensor patch structure is used by the Host only
* to address elements in a tensor view patch.
* \see <tt>\ref vxCopyTensorPatch2</tt>
* \ingroup group_tensor
*/
typedef struct _vx_tensorpatch_addressing_t * vx_trensor_addressing;
/*! \brief The weight bias parameter for fused layers
* \ingroup group_cnn
*/
@ -437,6 +470,8 @@ enum vx_type_e {
/* \todo add new object types here */
VX_TYPE_BFLOAT16 = 0x81A,/*!< \brief A <tt>\ref vx_bfloat16</tt>. */
VX_TYPE_INT4 = 0x81C,/*!< \brief A signed 4-bit data type. */
VX_TYPE_UINT4 = 0x81D,/*!< \brief An unsigned 4-bit data type. */
};
/*! \brief The enumeration of all status codes.


@ -53,17 +53,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxSysSetVipFrequency(
vx_uint32 shaderFscaleValue
);
/*! \brief cancel all VIP processing jobs.
/*! \brief cancel all VIP processing jobs on a device.
* \param [in] context The reference to the implementation context.
* \param [in] deviceID bound to graph.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS Cancelled all VIP processing jobs successfully
* \retval VX_SUCCESS Cancelled all VIP processing jobs successfully on a device
* and user can check return of vxProcessGraph() to get cancelled status.
* \retval VX_ERROR_INVALID_PARAMETERS Invalid context reference.
* \retval VX_ERROR_NOT_SUPPORTED Hardware does not support job cancellation.
* \retval VX_FAILURE Failed to cancel VIP processing job.
* \retval VX_FAILURE Failed to cancel VIP processing job on a device.
*/
VX_API_ENTRY vx_status VX_API_CALL vxSysCancelJob(
vx_context context
vx_context context,
vx_uint32 deviceID
);
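Typical use of the extended signature, sketched under the assumption that another thread is blocked in vxProcessGraph() on the same device; `context` is a placeholder created elsewhere.

vx_status s = vxSysCancelJob(context, 0 /* deviceID the graph is bound to */);
if (s == VX_ERROR_NOT_SUPPORTED)
{
    /* Hardware cannot cancel in-flight jobs; wait for vxProcessGraph() to return instead. */
}
/* On success, the blocked vxProcessGraph() call returns and reports the cancelled status. */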
#ifdef __cplusplus


@ -168,3 +168,7 @@ DEF_OP(CONV3D)
DEF_OP(DECONV3D)
DEF_OP(PAD2)
DEF_OP(COS)
DEF_OP(PRE_PROCESS_RGB888_PLANAR)
DEF_OP(GATHER_ELEMENTS)
DEF_OP(SELU)
DEF_OP(CELU)


@ -35,7 +35,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_MISH = 1,
VSI_NN_KERNEL_LUT_LOG = 2,
VSI_NN_KERNEL_LUT_EXP = 3,
VSI_NN_KERNEL_LUT_ELU = 4,
VSI_NN_KERNEL_LUT_SELU = 4,
VSI_NN_KERNEL_LUT_NEG = 5,
VSI_NN_KERNEL_LUT_HSIGMOID = 6,
VSI_NN_KERNEL_LUT_SOFT_PLUS = 7,
@ -45,6 +45,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_RELU_KERAS = 11,
VSI_NN_KERNEL_LUT_CLIP = 12,
VSI_NN_KERNEL_LUT_SQUARE = 13,
VSI_NN_KERNEL_LUT_CELU = 14,
};
#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024)


@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CELU_H
#define _VSI_NN_OP_CELU_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_celu_param
{
struct _celu_local_data_t* local;
// Add parameters here
float alpha;
} vsi_nn_celu_param;
_compiler_assert(offsetof(vsi_nn_celu_param, local) == 0, \
vsi_nn_celu_h );
#ifdef __cplusplus
}
#endif
#endif
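For context, a hedged sketch of how the new alpha parameter might be set through ovxlib's node API. vsi_nn_AddNode() and the `nn_param.celu` union member follow the pattern used by other operators and are assumptions here, not something this diff shows.

static void add_celu_node(vsi_nn_graph_t *graph)
{
    /* vsi_nn_AddNode() and nn_param.celu are assumed names; check the ovxlib sources. */
    vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_CELU,
                                         1 /* inputs */, 1 /* outputs */, NULL);
    if (node)
    {
        node->nn_param.celu.alpha = 1.0f;   /* alpha field from vsi_nn_celu_param */
    }
}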


@ -81,12 +81,7 @@ typedef struct _vsi_nn_clip_lcl_data
typedef struct _vsi_nn_clip_lcl2_data
{
uint32_t hash_idx;
vsi_bool execute_on_sw;
vsi_bool enable_image_2d;
uint32_t sizes0[VSI_NN_MAX_DIM_NUM];
uint32_t sizes1[VSI_NN_MAX_DIM_NUM];
uint32_t dim_num;
vsi_bool is_internal_node;
} vsi_nn_clip_lcl2_data;
typedef struct _vsi_nn_clip_param
@ -103,4 +98,3 @@ typedef struct _vsi_nn_clip_param
#endif
#endif


@ -0,0 +1,48 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GATHER_ELEMENTS_H
#define _VSI_NN_OP_GATHER_ELEMENTS_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_gather_elements_param
{
struct _gather_elements_local_data_t* local;
// Add parameters here
int32_t axis;
} vsi_nn_gather_elements_param;
_compiler_assert(offsetof(vsi_nn_gather_elements_param, local) == 0, \
vsi_nn_gather_elements_h );
#ifdef __cplusplus
}
#endif
#endif


@ -0,0 +1,64 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR_H
#define _VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_pre_process_rgb888_planar_param
{
struct _pre_process_rgb888_planar_local_data_t* local;
// Add parameters here
struct
{
uint32_t left;
uint32_t top;
uint32_t width;
uint32_t height;
} rect;
struct
{
vsi_size_t *size;
uint32_t dim_num;
} output_attr;
float r_mean;
float g_mean;
float b_mean;
float scale;
} vsi_nn_pre_process_rgb888_planar_param;
_compiler_assert(offsetof(vsi_nn_pre_process_rgb888_planar_param, local) == 0, \
vsi_nn_pre_process_rgb888_planar_h );
#ifdef __cplusplus
}
#endif
#endif


@ -0,0 +1,48 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SELU_H
#define _VSI_NN_OP_SELU_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_selu_param
{
struct _selu_local_data_t* local;
// Add parameters here
float alpha;
float gamma;
} vsi_nn_selu_param;
_compiler_assert(offsetof(vsi_nn_selu_param, local) == 0, \
vsi_nn_selu_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -56,6 +56,7 @@ extern "C" {
static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \
sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
if (array == NULL) return NULL; \
array->size = size; \
return array; \
} \
@ -205,6 +206,14 @@ static inline double vsi_rint
return inter;
} /* vsi_rint() */
/**
* Computes an approximation of the error function.
* This is the same approximation used by Eigen.
*
* @param[in] x The float input value.
*/
float vsi_nn_erf_impl(float x);
#ifdef __cplusplus
}
#endif
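A short sketch of how this helper is meant to be used; it mirrors the GELU evaluation that appears later in this change (the wrapper name `my_gelu` is illustrative):

/* Hypothetical sketch: GELU expressed via the erf approximation declared above. */
#include <math.h>

static float my_gelu( float x )
{
    return 0.5f * x * (1.0f + vsi_nn_erf_impl( x / sqrtf( 2.0f ) ));
}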

View File

@ -58,6 +58,41 @@ extern "C" {
#define BITS_PER_BYTE 8
#define VSI_NN_STRINGIZE(X) VSI_NN_DO_STRINGIZE(X)
#define VSI_NN_DO_STRINGIZE(X) #X
#define VSI_NN_JOIN(X, Y) VSI_NN_DO_JOIN(X, Y)
#define VSI_NN_DO_JOIN(X, Y) VSI_NN_DO_JOIN2(X,Y)
#define VSI_NN_DO_JOIN2(X, Y) X##Y
#if defined(_MSC_VER)
#define VSI_NN_DEPRECATED(symbol, hints) \
__declspec(deprecated(VSI_NN_STRINGIZE(hints))) symbol
#define VSI_NN_SUPPRESS_DEPRECATED_BEGIN \
__pragma(warning( push )) \
__pragma(warning(disable : 4996))
#define VSI_NN_SUPPRESS_DEPRECATED_END \
__pragma(warning(pop))
#elif defined(__GNUC__)
#define VSI_NN_DEPRECATED(symbol, hints) \
symbol __attribute__((deprecated(VSI_NN_STRINGIZE(hints))))
#define VSI_NN_SUPPRESS_DEPRECATED_BEGIN \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#define VSI_NN_SUPPRESS_DEPRECATED_END \
_Pragma("GCC diagnostic pop")
#else
#define VSI_NN_DEPRECATED(symbol, hints) \
symbol
#define VSI_NN_SUPPRESS_DEPRECATED_BEGIN
#define VSI_NN_SUPPRESS_DEPRECATED_END
#endif
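A brief sketch of how the deprecation macros are used (the struct and field names are illustrative; the deprecated `reshape` member later in this change is a real example):

/* Hypothetical sketch: deprecate a field and suppress the warning at a
 * call site that still has to read it. */
typedef struct
{
    int VSI_NN_DEPRECATED(old_field, "Replace with new_field");
    int new_field;
} example_param_t;

static int read_old_field( const example_param_t * p )
{
VSI_NN_SUPPRESS_DEPRECATED_BEGIN
    return p->old_field;
VSI_NN_SUPPRESS_DEPRECATED_END
}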
/*-------------------------------------------
Functions
-------------------------------------------*/

View File

@ -0,0 +1,56 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VIP_VIRTUAL_DEVICE_H
#define _VIP_VIRTUAL_DEVICE_H
#include <cstdint>
#include <memory>
#include <functional>
struct _vsi_nn_graph;
typedef struct _vsi_nn_graph vsi_nn_graph_t;
namespace vip {
class Device;
using func_t = std::function<bool (const void*)>;
using data_t = const void*;
class IDevice {
public:
IDevice(uint32_t id);
~IDevice();
uint32_t Id() const;
bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data);
bool GraphRemove(const vsi_nn_graph_t* graph);
bool ThreadExit();
bool ThreadIdle();
void WaitThreadIdle();
protected:
Device* device_;
};
} // namespace vip
#endif

View File

@ -27,6 +27,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_assert.h"
#include "utils/vsi_nn_util.h"
#include "ops/vsi_nn_op_activations.h"
#include "ops/vsi_nn_op_batch_norm.h"
#include "ops/vsi_nn_op_multiply.h"
@ -185,6 +186,10 @@
#include "ops/vsi_nn_op_deconv3d.h"
#include "ops/vsi_nn_op_reduce_mean_internal.h"
#include "ops/vsi_nn_op_pad2.h"
#include "ops/vsi_nn_op_pre_process_rgb888_planar.h"
#include "ops/vsi_nn_op_gather_elements.h"
#include "ops/vsi_nn_op_selu.h"
#include "ops/vsi_nn_op_celu.h"
/* custom node head define */
#include "custom/vsi_nn_custom_node_type.h"
@ -210,7 +215,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_multiply_param multiply;
vsi_nn_proposal_param proposal;
vsi_nn_deconv_param deconv;
vsi_nn_reshape_param reshape;
vsi_nn_reshape_param VSI_NN_DEPRECATED(reshape, "Replace with reshape2");
vsi_nn_permute_param permute;
vsi_nn_upsample_param upsample;
vsi_nn_resize_param resize;
@ -356,6 +361,10 @@ typedef union _vsi_nn_nn_param
vsi_nn_deconv3d_param deconv3d;
vsi_nn_reduce_mean_internal_param reduce_mean_internal;
vsi_nn_pad2_param pad2;
vsi_nn_pre_process_rgb888_planar_param pre_process_rgb888_planar;
vsi_nn_gather_elements_param gather_elements;
vsi_nn_selu_param selu;
vsi_nn_celu_param celu;
void* client_param;
/* custom node data struct define */

View File

@ -84,6 +84,7 @@ typedef enum
VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR,
VSI_NN_SOURCE_FORMAT_IMAGE_YUV444,
VSI_NN_SOURCE_FORMAT_IMAGE_NV12,
VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP,
} vsi_nn_preprocess_source_format_e;
/**
@ -235,6 +236,13 @@ OVXLIB_API vsi_status vsi_nn_AddGraphPostProcess
uint32_t count
);
OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
(
vsi_nn_graph_t* graph,
vsi_nn_node_id_t* enable_nodes,
uint32_t enable_nodes_count
);
#ifdef __cplusplus
}
#endif

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 39
#define VSI_NN_VERSION_PATCH 43
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)

View File

@ -45,7 +45,7 @@ __BEGIN_DECLS
#define CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
#define PACK_KERNEL_MAP_3D( IN_DTYPE, OUT_DTYPE ) \
{ CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("cl.clip_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_CLIP_KERNEL_SOURCE(IN_DTYPE) }
@ -64,16 +64,22 @@ typedef struct
static const _kernel_map_type _clip_kernel_map[] =
{
PACK_KERNEL_MAP(F32, F32),
PACK_KERNEL_MAP(F32, U8),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(U8, F32),
PACK_KERNEL_MAP(BF16, BF16),
PACK_KERNEL_MAP_2D(F32, F32),
PACK_KERNEL_MAP_2D(F32, U8),
PACK_KERNEL_MAP_2D(U8, U8),
PACK_KERNEL_MAP_2D(U8, F32),
PACK_KERNEL_MAP_2D(BF16, BF16),
PACK_KERNEL_MAP_3D(F32, F32),
PACK_KERNEL_MAP_3D(F32, U8),
PACK_KERNEL_MAP_3D(F32, I32),
PACK_KERNEL_MAP_3D(U8, U8),
PACK_KERNEL_MAP_3D(U8, F32),
PACK_KERNEL_MAP_3D(I32, I32),
PACK_KERNEL_MAP_3D(I32, F32),
PACK_KERNEL_MAP_3D(BF16, BF16),
PACK_KERNEL_MAP_2D(F32, F32),
PACK_KERNEL_MAP_2D(F32, U8),
PACK_KERNEL_MAP_2D(F32, I32),
PACK_KERNEL_MAP_2D(U8, U8),
PACK_KERNEL_MAP_2D(U8, F32),
PACK_KERNEL_MAP_2D(I32, I32),
PACK_KERNEL_MAP_2D(I32, F32),
PACK_KERNEL_MAP_2D(BF16, BF16),
};
@ -100,9 +106,6 @@ static vx_param_description_t _clip_kernel_param_def[] =
#define SCALAR_OUTPUT_SCALE (6)
#define SCALAR_OUTPUT_TAIL (7)
#define CLIP_PARAM_NUM 4
#define CLIP_QUANT_PARAM_NUM _cnt_of_array( _clip_kernel_param_def )
/*
* Kernel initializer
*/
@ -122,7 +125,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * out_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
@ -149,8 +152,6 @@ final:
return status;
} /* _clip_initializer() */
/*
* Query kernel
*/
@ -159,8 +160,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d,
vsi_bool *is_use_u8_kernel
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
@ -178,37 +178,47 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
#define _PACK_SELECT_KEY( in_type, out_type ) \
( ( in_type ) | ( out_type << 8 ))
if (F16 == out_dtype)
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
out_dtype = F32;
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = CLIP_HASH_KEY( F32, F32, image_2d );
break;
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F16, I32):
key = CLIP_HASH_KEY( F32, I32, image_2d );
break;
case _PACK_SELECT_KEY(I8, I8):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I32, I32):
key = CLIP_HASH_KEY( I32, I32, image_2d );
break;
case _PACK_SELECT_KEY(I8, F16):
case _PACK_SELECT_KEY(I16, F16):
case _PACK_SELECT_KEY(I32, F16):
case _PACK_SELECT_KEY(I8, F32):
case _PACK_SELECT_KEY(I16, F32):
case _PACK_SELECT_KEY(I32, F32):
key = CLIP_HASH_KEY( I32, F32, image_2d );
break;
default:
key = CLIP_HASH_KEY( in_dtype, out_dtype, image_2d );
break;
}
#undef _PACK_SELECT_KEY
if ((U8 == in_dtype) || (U8 == out_dtype))
{
param_def_size = CLIP_QUANT_PARAM_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = CLIP_PARAM_NUM;
*is_use_u8_kernel = FALSE;
}
key = CLIP_HASH_KEY( in_dtype, out_dtype, image_2d );
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
{
break;
}
}
if( i < (uint32_t)kernel_map_size )
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@ -246,7 +256,6 @@ static vsi_nn_kernel_node_t _setup
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
vsi_bool is_use_u8_kernel = FALSE;
float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" );
float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" );
@ -261,40 +270,31 @@ static vsi_nn_kernel_node_t _setup
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel);
status = _query_kernel( kernel, inputs, outputs, image_2d);
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status )
{
size_t node_params_num = CLIP_PARAM_NUM;
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value );
node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
node_params_num = CLIP_QUANT_PARAM_NUM;
}
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
status = vsi_nn_kernel_node_pass_param( node, node_params, _CLIP_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MIN_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
return node;

View File

@ -45,13 +45,14 @@ typedef enum
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
UNARY_GELU,
UNARY_HGELU
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
} unary_type_e;
/*
@ -60,16 +61,18 @@ typedef enum
#define HASH_UNARY_KEY(_type, _input_type, _output_type, _image_2d) \
((_type << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d))
#define VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() \
"eltwise_unary"
#define _UNARY_KERNEL_SOURCE0_NAME() \
"eltwise_unary_0"
#define _UNARY_KERNEL_SOURCE1_NAME() \
"eltwise_unary_1"
#define HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE)
#define TENSOR_UNARY_KERNELS(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
#define TENSOR_UNARY_KERNELS_3D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \
HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
_UNARY_KERNEL_SOURCE1_NAME() },
#define HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE"_2D")
@ -77,29 +80,20 @@ typedef enum
#define TENSOR_UNARY_KERNELS_2D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 1), \
HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
#define TENSOR_UNARY_KERNELS_FLOAT(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \
HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, F32, F32), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
#define TENSOR_UNARY_KERNELS_FLOAT_2D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 1), \
HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, F32, F32), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
_UNARY_KERNEL_SOURCE0_NAME() },
#define SIN_OPERATION sin
#define COS_OPERATION cos
#define EXP_OPERATION exp
#define LOG_OPERATION log
#define ELU_OPERATION elu
#define NEG_OPERATION neg
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define ROUND_OPERATION round
#define GELU_OPERATION gelu
#define HGELU_OPERATION hard_gelu
#define SELU_OPERATION selu
#define CELU_OPERATION celu
static const struct {
uint32_t key;
@ -107,77 +101,59 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(ELU_OPERATION, UNARY_ELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(ELU_OPERATION, UNARY_ELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(NEG_OPERATION, UNARY_NEG, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F16, F16)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(ELU_OPERATION, UNARY_ELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(ELU_OPERATION, UNARY_ELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(NEG_OPERATION, UNARY_NEG, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F32, F32)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8)
TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8)
TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
};
@ -186,13 +162,14 @@ static const struct {
#undef COS_OPERATION
#undef EXP_OPERATION
#undef LOG_OPERATION
#undef ELU_OPERATION
#undef NEG_OPERATION
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
#undef GELU_OPERATION
#undef HGELU_OPERATION
#undef SELU_OPERATION
#undef CELU_OPERATION
/*
* Kernel params
*/
@ -284,7 +261,21 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d );
#define _PACK_SELECT_KEY( in_type, out_type ) \
( ( in_type ) | ( out_type << 8 ))
switch (_PACK_SELECT_KEY(input_dtype, output_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = HASH_UNARY_KEY( type, F32, F32, image_2d );
break;
default:
key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d );
break;
}
#undef _PACK_SELECT_KEY
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
@ -336,6 +327,15 @@ static vsi_nn_kernel_node_t _setup
float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
float beta = vsi_nn_kernel_param_get_float32( params, "beta" );
if (unary_type == UNARY_SELU)
{
alpha = alpha * beta;
}
else if (unary_type == UNARY_CELU)
{
beta = 1.0f / alpha;
}
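/* Clarifying note (not from the original change): the downstream kernel
 * receives a single (alpha, beta) scalar pair. SELU computes
 *   selu(x) = gamma * x                       , x > 0
 *   selu(x) = gamma * alpha * (exp(x) - 1)    , x <= 0
 * so alpha*gamma is folded into one scalar here; CELU computes
 *   celu(x) = max(0, x) + min(0, alpha * (exp(x / alpha) - 1))
 * so passing beta = 1/alpha lets the kernel replace the division with a multiply. */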
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
@ -450,11 +450,12 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
__END_DECLS

View File

@ -0,0 +1,282 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_GATHER_ELEMENTS,
} _internal_kernel_e;
#define _GATHER_ELEMENTS_KERNEL_SOURCE "gather_elements"
#define STR(a) #a
// Add kernel hashtable here
#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \
(( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ))
#define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_GATHER_ELEMENTS_KERNEL_SOURCE}
#define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
_GATHER_ELEMENTS_KERNEL_SOURCE}
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _gather_elements_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_3D_MAP( 0, F32, I32, F32 ),
PACK_KERNEL_3D_MAP( 0, I32, I32, I32 ),
PACK_KERNEL_3D_MAP( 0, U32, I32, U32 ),
PACK_KERNEL_3D_MAP( 1, F32, I32, F32 ),
PACK_KERNEL_3D_MAP( 1, I32, I32, I32 ),
PACK_KERNEL_3D_MAP( 1, U32, I32, U32 ),
PACK_KERNEL_3D_MAP( 2, F32, I32, F32 ),
PACK_KERNEL_3D_MAP( 2, I32, I32, I32 ),
PACK_KERNEL_3D_MAP( 2, U32, I32, U32 ),
PACK_KERNEL_2D_MAP( 0, F32, I32, F32 ),
PACK_KERNEL_2D_MAP( 0, I32, I32, I32 ),
PACK_KERNEL_2D_MAP( 0, U32, I32, U32 ),
PACK_KERNEL_2D_MAP( 1, F32, I32, F32 ),
PACK_KERNEL_2D_MAP( 1, I32, I32, I32 ),
PACK_KERNEL_2D_MAP( 1, U32, I32, U32 ),
};
/*
* Kernel params
*/
static vx_param_description_t _gather_elements_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _GATHER_ELEMENTS_PARAM_NUM _cnt_of_array( _gather_elements_kernel_param_def )
#define SCALAR_INPUT_SCALE (3)
#define SCALAR_INPUT_TAIL (4)
#define SCALAR_INPUT_AXIS_SIZE (5)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _gather_elements_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _gather_elements_kernel_map;
size_t kernel_map_size = _cnt_of_array( _gather_elements_kernel_map );
vx_param_description_t * param_def = _gather_elements_kernel_param_def;
vx_kernel_initialize_f initializer = _gather_elements_initializer;
int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0;
uint32_t key = 0;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define _PACK_SELECT_KEY( in0_type, out_type ) \
( ( in0_type ) | ( out_type << 8 ))
switch (_PACK_SELECT_KEY(in0_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d );
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_GATHER_ELEMENTS_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
float output_scale = vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t axis_size = (int32_t)inputs[0]->attr.size[axis];
status = _query_kernel( kernel, inputs, outputs, axis );
if ( VSI_SUCCESS == status)
{
input_scale = input_scale / output_scale;
input_tail = output_zp - input_tail * input_scale;
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _GATHER_ELEMENTS_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail );
node_params[SCALAR_INPUT_AXIS_SIZE] = vsi_nn_kernel_scalar_create(graph, I32, &axis_size );
status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_ELEMENTS_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS_SIZE] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( gather_elements, _setup )

View File

@ -59,9 +59,11 @@ typedef struct
static const _kernel_map_type _l2normalizescale_kernel_map[] =
{
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F32, F32, F32 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F32, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8, F32, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I32, F32, I32 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F32, F32, F32 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F32, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8, F32, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I32, F32, I32 )
};
@ -91,9 +93,6 @@ static vx_param_description_t _l2normalizescale_kernel_param_def[] =
#define SCALAR_OUTPUT_SCALE (8)
#define SCALAR_OUTPUT_TAIL (9)
#define L2NORMSCALE_PARAM_NUM 6
#define L2NORMSCALE_QUANT_PARAM_NUM _cnt_of_array( _l2normalizescale_kernel_param_def )
/*
* Kernel initializer
*/
@ -168,8 +167,7 @@ static vsi_status _query_kernel
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
vsi_bool image_2d,
vsi_bool *is_use_u8_kernel
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
@ -193,6 +191,10 @@ static vsi_status _query_kernel
{
in0_dtype = F32;
}
else if (I8 == in0_dtype || I16 == in0_dtype)
{
in0_dtype = I32;
}
if (F16 == in1_dtype)
{
@ -203,16 +205,9 @@ static vsi_status _query_kernel
{
out_dtype = F32;
}
if ((U8 == in0_dtype) || (U8 == out_dtype))
else if (I8 == out_dtype || I16 == out_dtype)
{
param_def_size = L2NORMSCALE_QUANT_PARAM_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = L2NORMSCALE_PARAM_NUM;
*is_use_u8_kernel = FALSE;
out_dtype = I32;
}
key = HASH_L2NORMALIZESCALE_HASH_KEY(axis, in0_dtype, in1_dtype, out_dtype, image_2d);
@ -265,7 +260,6 @@ static vsi_nn_kernel_node_t _setup
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float epsilon = (float)10e-12;
float rsEps = 1.0f / sqrtf(epsilon);
vsi_bool is_use_u8_kernel = FALSE;
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
@ -282,7 +276,7 @@ static vsi_nn_kernel_node_t _setup
}
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, axis, image_2d, &is_use_u8_kernel );
status = _query_kernel( kernel, inputs, outputs, axis, image_2d );
axis_size = inputs[0]->attr.size[axis];
@ -291,7 +285,6 @@ static vsi_nn_kernel_node_t _setup
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
size_t node_params_num = L2NORMSCALE_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _L2NORMALIZESCALE_PARAM_NUM,
inputs, input_num, outputs, output_num );
@ -301,27 +294,21 @@ static vsi_nn_kernel_node_t _setup
graph, I32, &axis_size );
node_params[SCALAR_EPS_VALUE] = vsi_nn_kernel_scalar_create(
graph, F32, &rsEps );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
node_params_num = L2NORMSCALE_QUANT_PARAM_NUM;
}
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
status = vsi_nn_kernel_node_pass_param( node, node_params, _L2NORMALIZESCALE_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_AXIS_SIZE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_EPS_VALUE] );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
return node;

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "math.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
@ -43,8 +42,6 @@ __BEGIN_DECLS
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "maximum",
#define KERNEL_SOURCE_2 "maximum_fp16",
#define KERNEL_SOURCE_3 "maximum_i16"
#define HASH_MAXIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
@ -198,16 +195,25 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define CONVERT_I8_OR_I16TOI32(dtype) \
dtype = (dtype == I8 || dtype == I16) ? I32 : dtype
CONVERT_I8_OR_I16TOI32(input0_dtype);
CONVERT_I8_OR_I16TOI32(input1_dtype);
CONVERT_I8_OR_I16TOI32(output_dtype);
#undef CONVERT_I8_OR_I16TOI32
key = HASH_MAXIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if( kernel_map[i].key == key )
if ( kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(kernel_map) )
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
@ -248,7 +254,7 @@ static vsi_nn_kernel_node_t _setup
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -256,11 +262,11 @@ static vsi_nn_kernel_node_t _setup
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );

View File

@ -42,8 +42,6 @@ __BEGIN_DECLS
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "minimum",
#define KERNEL_SOURCE_2 "minimum_fp16",
#define KERNEL_SOURCE_3 "minimum_i16"
#define HASH_MINIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
@ -197,16 +195,25 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define CONVERT_I8_OR_I16TOI32(dtype) \
dtype = (dtype == I8 || dtype == I16) ? I32 : dtype
CONVERT_I8_OR_I16TOI32(input0_dtype);
CONVERT_I8_OR_I16TOI32(input1_dtype);
CONVERT_I8_OR_I16TOI32(output_dtype);
#undef CONVERT_I8_OR_I16TOI32
key = HASH_MINIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if( kernel_map[i].key == key )
if ( kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(kernel_map) )
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
@ -247,7 +254,7 @@ static vsi_nn_kernel_node_t _setup
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -255,11 +262,11 @@ static vsi_nn_kernel_node_t _setup
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, image_2d, kernel );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );

View File

@ -176,6 +176,10 @@ static vsi_status _query_kernel
{
in_dtype = F32;
}
else if (in_dtype == I16 || in_dtype == I8)
{
in_dtype = I32;
}
if (out_dtype == F16)
{

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@ -146,6 +145,7 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer)
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(rois_attr);
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
@ -212,7 +212,6 @@ static vsi_status _query_kernel
}
return status;
} /* _query_kernel() */
#define _INPUT_NUM (3)
@ -326,4 +325,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( roi_align, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -76,8 +75,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
@ -93,7 +92,6 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -243,4 +241,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( add_mean_std_norm, _setup )

View File

@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (1)
@ -138,20 +137,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_argmax_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -159,7 +144,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _argmax_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -210,4 +199,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( argmax, _setup )

View File

@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -139,20 +138,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_argmin_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -160,7 +145,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _argmin_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -211,4 +200,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( argmin, _setup )

View File

@ -108,8 +108,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -128,7 +128,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@ -276,4 +275,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup )

View File

@ -34,7 +34,6 @@
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
@ -160,20 +159,6 @@ static vx_param_description_t kernel_param_def[] =
#define SCALAR_INPUT_EPS (6)
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_batch_norm_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -181,7 +166,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _batch_norm_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -231,4 +220,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( batchnorm_single, _setup )

View File

@ -34,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -225,8 +224,8 @@ DEF_KERNEL_EXECUTOR(_compute)
int32_t* int32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
int32_t* int32_out_buffer[_OUTPUT_NUM] = {0};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
@ -72,8 +71,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -92,7 +91,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i]->asymm.zero_point = 0;
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -217,4 +215,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( cast, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -93,7 +92,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -212,10 +210,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( clip, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -206,20 +205,6 @@ static vx_param_description_t kernel_param_def[] =
#define INPUT_FUNC_OP (3)
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_comparisons_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -227,7 +212,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _comparisons_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */

View File

@ -34,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -182,7 +181,6 @@ final:
}
return status;
} /* _compute() */
/*

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -150,20 +149,6 @@ static vx_param_description_t _depth2space_crd_kernel_param_def[] =
};
#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_depth2space_crd_exec,
_depth2space_crd_kernel_param_def,
_cnt_of_array( _depth2space_crd_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -171,7 +156,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _depth2space_crd_exec;
kernel->info.parameters = _depth2space_crd_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _depth2space_crd_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -220,4 +209,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( depth2space_internal, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "cpu_backend/npuref_interface.h"
__BEGIN_DECLS
@ -272,4 +271,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( depthwise_conv1d, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -82,8 +81,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -102,7 +101,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for ( i = 0; i < _OUTPUT_NUM; i++ )
{
@ -252,4 +250,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( detect_post_box, _setup )

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -35,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -199,8 +197,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -222,7 +220,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for ( i = 0; i < _OUTPUT_NUM; i++ )
{
@ -524,4 +521,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( detect_post_nms, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -43,13 +42,14 @@ typedef enum
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
UNARY_GELU,
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
} unary_type_e;
@ -80,11 +80,6 @@ static float log_eval(float data)
return logf(data);
}
static float elu_eval(float data, float alpha)
{
return data >=0 ? data : expf(data) * alpha - alpha;
}
static float neg_eval(float data)
{
return data * -1.0f;
@ -117,45 +112,9 @@ static float round_eval(float data)
return data;
}
static float erf_eval(float x)
{
float res = 0;
float tmp = x;
float factorial = 1; /*n!*/
float x_pow = x;
int32_t one = 1;
int32_t n = 1;
if (x <= -3)
{
return -1;
}
else if (x >= 3)
{
return 1;
}
while (vsi_abs(tmp) > 1e-5)
{
res += tmp;
factorial *= n;
one *= -1;
x_pow *= x * x;
tmp = one / factorial * x_pow / ( 2 * n + 1);
n ++;
}
#define VSI_MUL2_RSQRTPI (1.1283791670955126f)
res *= VSI_MUL2_RSQRTPI;
return res;
}
static float gelu_eval(float data)
{
data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f))));
data = (float)(0.5f * data * (1 + vsi_nn_erf_impl(data / (float)sqrt(2.0f))));
return data;
}
@ -169,6 +128,23 @@ static float hgelu_eval(float data)
return data * cdf;
}
static float selu_eval(float data, float alpha, float gamma)
{
float y0 = alpha * gamma * expf(data) - alpha * gamma;
float y1 = gamma * data;
float y = data <= 0 ? y0 : y1;
return y;
}
static float celu_eval(float x, float alpha)
{
float positive = vsi_nn_max(0, x);
float negative = vsi_nn_min(alpha * (expf(x / alpha) - 1), 0);
return positive + negative;
}
DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
(
vsi_nn_kernel_node_t node,
@ -227,9 +203,6 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_LOG:
data = log_eval(data);
break;
case UNARY_ELU:
data = elu_eval(data, alpha);
break;
case UNARY_NEG:
data = neg_eval(data);
break;
@ -248,6 +221,12 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_HGELU:
data = hgelu_eval(data);
break;
case UNARY_SELU:
data = selu_eval(data, alpha, beta);
break;
case UNARY_CELU:
data = celu_eval(data, alpha);
break;
default:
break;
}
@ -287,20 +266,6 @@ static vx_param_description_t kernel_param_def[] =
#define INPUT_SCALAR_ALPHA (3)
#define INPUT_SCALAR_BETA (4)
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_eltwise_unary_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -308,7 +273,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _eltwise_unary_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -384,10 +353,11 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -58,7 +57,6 @@ static vx_param_description_t _erf_kernel_param_def[] =
};
#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def )
/*
* Kernel function
*/
@ -74,8 +72,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -101,34 +99,10 @@ DEF_KERNEL_EXECUTOR(_compute)
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
#define VSI_ERF_PI 3.141592653589793
for (i = 0; i < out_elements[0]; i ++)
{
/* 2 / sqrt(pi) * (sum[(-1)^n! * x ^ (2n + 1)] + x) */
float x = vsi_clamp(f32_in_buffer[0][i], -2, 2);
float res = 0;
float tmp = x;
float factorial = 1; /*n!*/
float x_pow = x;
int32_t one = 1;
int32_t n = 1;
while (vsi_abs(tmp) > 1e-5)
{
res += tmp;
factorial *= n;
one *= -1;
x_pow *= x * x;
tmp = one / factorial * x_pow / ( 2 * n + 1);
n ++;
}
res *= 2.0f / (float)sqrt(VSI_ERF_PI);
f32_out_buffer[0][i] = res;
float x = vsi_nn_erf_impl(f32_in_buffer[0][i]);
f32_out_buffer[0][i] = x;
}
/* save data */

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -94,8 +93,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
@ -110,7 +109,6 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( floordiv, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -181,20 +180,6 @@ static vx_param_description_t _gather_kernel_param_def[] =
};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_gather_exec,
_gather_kernel_param_def,
_cnt_of_array( _gather_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -202,7 +187,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _gather_exec;
kernel->info.parameters = _gather_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -260,4 +249,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( gather, _setup )

View File

@ -0,0 +1,228 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _ARG_NUM (1)
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _CPU_PARAM_NUM (_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather_elements")
/*
* Kernel params
*/
static vx_param_description_t _gather_elements_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GATHER_ELEMENTS_PARAM_NUM _cnt_of_array( _gather_elements_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[2] = { NULL };
int32_t* buffer_idx = NULL;
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
vsi_size_t a = 0;
vsi_size_t o = 0;
vsi_size_t i = 0;
vsi_size_t outer_size[2] = {1, 1};
vsi_size_t inner_size[2] = {1, 1};
vsi_size_t axis_size[2] = {1, 1};
int32_t axis = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer_idx = (int32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE );
CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
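/* Decompose the shapes around `axis`: inner_size covers the dimensions below
 * the axis, outer_size the dimensions above it, and axis_size holds the axis
 * extent of the input ([0]) and output ([1]) tensors. */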
axis_size[0] = attr[0]->shape->data[axis];
axis_size[1] = attr[2]->shape->data[axis];
for (i = 0; i < (vsi_size_t)axis; ++i)
{
inner_size[0] *= attr[0]->shape->data[i];
inner_size[1] *= attr[2]->shape->data[i];
}
for (i = axis + 1; i < attr[2]->shape->size; ++i)
{
outer_size[0] *= attr[0]->shape->data[i];
outer_size[1] *= attr[2]->shape->data[i];
}
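/* Gather along `axis`: each output element at (outer o, axis a, inner i) reads
 * the index tensor at the same flat position, wraps a negative index by the
 * input axis extent, and copies input[o, index, i] into the output. */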
for (o = 0; o < outer_size[1]; o++)
{
for (a = 0; a < axis_size[1]; a++)
{
for (i = 0; i < inner_size[1]; i++)
{
vsi_ssize_t index = 0;
vsi_size_t index0 = (o * axis_size[1] + a) * inner_size[1] + i;
vsi_size_t index1 = 1;
index = (vsi_ssize_t)buffer_idx[index0];
index = index < 0 ? index + (vsi_ssize_t)axis_size[0] : index;
index1 = (o * axis_size[0] + index) * inner_size[0] + i;
buffer[1][index0] = buffer[0][index1];
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
if ( buffer_idx )
{
free( buffer_idx );
}
for ( i = 0; i < 2; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_SUCCESS;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _gather_elements_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def );
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_GATHER_ELEMENTS_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _GATHER_ELEMENTS_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_ELEMENTS_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( gather_elements, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -166,20 +165,6 @@ static vx_param_description_t _gather_nd_kernel_param_def[] =
};
#define _GATHER_ND_PARAM_NUM _cnt_of_array( _gather_nd_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_gather_nd_exec,
_gather_nd_kernel_param_def,
_cnt_of_array( _gather_nd_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -187,7 +172,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _gather_nd_exec;
kernel->info.parameters = _gather_nd_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _gather_nd_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -238,4 +227,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( gather_nd, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
@ -195,8 +194,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -215,7 +214,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@ -504,4 +502,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( generate_proposals, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@ -187,20 +186,6 @@ static vx_param_description_t _group_normalization_kernel_param_def[] =
};
#define _GROUP_NORMALIZATION_PARAM_NUM _cnt_of_array( _group_normalization_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_group_norm_exec,
_group_normalization_kernel_param_def,
_cnt_of_array( _group_normalization_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -208,7 +193,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _group_norm_exec;
kernel->info.parameters = _group_normalization_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _group_normalization_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -312,4 +301,3 @@ final:
__END_DECLS
REGISTER_BACKEND_CPU( group_norm, _setup )

View File

@ -34,7 +34,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -474,7 +473,9 @@ static vsi_nn_kernel_node_t _setup
if( node )
{
_inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**));
CHECK_PTR_FAIL_GOTO( _inputs, "Create buffer fail.", final );
node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final );
for(i = 0; i < input_count; i++)
{
_inputs[i] = inputs[i];
@ -504,6 +505,7 @@ static vsi_nn_kernel_node_t _setup
}
}
final:
vsi_nn_safe_free(_inputs);
vsi_nn_safe_free(node_params);
return node;
@ -512,4 +514,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( grucell_activation, _setup )

View File

@ -33,7 +33,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -179,4 +178,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( grucell_activation_sma, _setup )

View File

@ -36,7 +36,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -124,8 +123,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -187,20 +186,6 @@ static vx_param_description_t _instance_normalization_kernel_param_def[] =
};
#define _INSTANCE_NORMALIZATION_PARAM_NUM _cnt_of_array( _instance_normalization_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_instance_norm_exec,
_instance_normalization_kernel_param_def,
_cnt_of_array( _instance_normalization_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -208,7 +193,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _instance_norm_exec;
kernel->info.parameters = _instance_normalization_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _instance_normalization_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -99,7 +98,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -246,4 +244,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( l2normalizescale, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -184,20 +183,6 @@ static vx_param_description_t _layer_normalization_kernel_param_def[] =
};
#define _LAYER_NORMALIZATION_PARAM_NUM _cnt_of_array( _layer_normalization_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_layer_norm_exec,
_layer_normalization_kernel_param_def,
_LAYER_NORMALIZATION_PARAM_NUM,
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -205,7 +190,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _layer_norm_exec;
kernel->info.parameters = _layer_normalization_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _layer_normalization_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -252,4 +241,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( layer_norm, _setup )

View File

@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (2)
@ -153,20 +152,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_log_softmax_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -174,7 +159,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _log_softmax_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -232,4 +221,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( log_softmax, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -72,8 +71,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -85,7 +84,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -197,4 +195,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( logical_not, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -96,8 +95,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
@ -113,7 +112,6 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -264,4 +262,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( logical_ops, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -156,7 +155,6 @@ DEF_KERNEL_EXECUTOR(_compute)
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
}
for( i = 0; i < _OUTPUT_NUM; i++ )
@ -308,7 +306,6 @@ final:
}
return status;
} /* _compute() */
@ -331,7 +328,6 @@ static vsi_status _query_kernel
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
@ -397,10 +393,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( lstmunit_activation, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -190,20 +189,6 @@ static vx_param_description_t _matrixmul_kernel_param_def[] =
};
#define _MATIRXMUL_PARAM_NUM _cnt_of_array( _matrixmul_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_matrixmul_exec,
_matrixmul_kernel_param_def,
_cnt_of_array( _matrixmul_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -211,7 +196,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _matrixmul_exec;
kernel->info.parameters = _matrixmul_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -261,4 +250,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( matrixmul, _setup )

View File

@ -36,7 +36,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -147,21 +146,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_maximum_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -169,7 +153,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _maximum_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -211,4 +199,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( maximum, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -143,21 +142,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_minimum_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -165,7 +149,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _minimum_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -207,4 +195,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( minimum, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -218,20 +217,6 @@ static vx_param_description_t _moments_kernel_param_def[] =
};
#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_moments_exec,
_moments_kernel_param_def,
_cnt_of_array( _moments_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -239,7 +224,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _moments_exec;
kernel->info.parameters = _moments_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _moments_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -315,4 +304,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( moments, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -86,8 +85,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -112,7 +111,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -335,4 +333,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( poolwithargmax, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -146,21 +145,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_pow_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -168,7 +152,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _pow_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -210,4 +198,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pow, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -283,21 +282,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_pre_process_bgra_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -305,7 +289,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _pre_process_bgra_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -381,4 +369,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_bgra, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -194,21 +193,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_pre_process_gray_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -216,7 +200,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _pre_process_gray_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -280,4 +268,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_gray, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -256,21 +255,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_pre_process_nv12_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -278,7 +262,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _pre_process_nv12_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -354,4 +342,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_nv12, _setup )

View File

@ -0,0 +1,297 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (8)
#define _CPU_INPUT_NUM (3)
#define _CPU_OUTPUT_NUM (3)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_rgb888_planar")
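/* DESCALE rounds a Q20 fixed-point intermediate (two Q10 interpolation steps)
 * back down to an integer pixel value. */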
#define DESCALE(x) (((x) + (1<<19)) >> 20)
/*
* Kernel params
*/
static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
float mean[3] = {0}, scale = 1;
for (i = 0; i < _CPU_IO_NUM; i++)
{
tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
}
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
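/* Scalar parameters follow the 3 input and 3 output tensors:
 * x/y ratio, x/y offset, the per-channel means and the output scale. */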
i = 6;
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[0]);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[1]);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[2]);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &scale);
CHECK_STATUS_FAIL_GOTO(status, final );
for (i = 0; i < 3; i++)
{
buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final );
memset( buffer[i + 3], 0, out_elements * sizeof(float) );
}
{
int32_t line1[2], line2[2];
int32_t dx = 0, dy = 0, idx = 0;
int32_t src_width = (int32_t)attr[0]->shape->data[0];
int32_t dst_width = (int32_t)attr[3]->shape->data[0];
int32_t dst_height = (int32_t)attr[3]->shape->data[1];
uint8_t result = 0;
for ( idx = 0; idx < 3; idx ++)
{
for ( dy = 0; dy < (int32_t)dst_height; dy ++)
{
for ( dx = 0; dx < (int32_t)dst_width; dx ++)
{
int32_t source_index = 0;
int32_t output_index = dx + dy * dst_width;
float finalVal = 0.0f;
if(xRatio != (1 << 15) || yRatio != (1 << 15))
{
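/* Bilinear resize in fixed point: sx/sy are the integer source coordinates
 * (floor of a Q15 position), fx/fy the Q10 fractional weights used to blend
 * the 2x2 source neighborhood. */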
int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14);
int32_t sx = fx & 0xffff8000; // Floor
int32_t fy = 0, sy = 0;
int32_t temp1 = 0;
int32_t temp2 = 0;
fx -= sx;
sx = sx >> 15;
sx = sx < 0 ? 0 : sx;
sx = sx > src_width ? src_width - 1: sx;
fx = (fx +(1 << 4)) >> 5;
// for y
fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14);
sy = fy & 0xffff8000; // Floor
fy -= sy;
sy = sy >> 15;
sy = sy < 0 ? 0 : sy;
fy = fy < 0 ? 0 : fy;
fy = (fy + (1<< 4)) >> 5;
sx += xOffset;
sy += yOffset;
source_index = (sx + sy * src_width);
line1[0] = (int32_t)buffer[idx][source_index];
line1[1] = (int32_t)buffer[idx][source_index + 1];
line2[0] = (int32_t)buffer[idx][source_index + src_width];
line2[1] = (int32_t)buffer[idx][source_index + src_width + 1];
temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10);
temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10);
temp1 = fy * (temp2 - temp1) + (temp1 << 10);
result = (uint8_t)(DESCALE(temp1));
finalVal = (result - mean[idx]) * scale;
buffer[idx + 3][output_index] = finalVal;
}
else
{
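/* No resize (ratio == 1 << 15): copy the cropped pixel and apply the
 * mean/scale normalization directly. */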
int32_t offset = xOffset + yOffset * src_width;
source_index = dx + dy * src_width + offset;
finalVal = (buffer[idx][source_index] - mean[idx]) * scale;
buffer[idx + 3][output_index] = finalVal;
}
}
}
}
}
for (i = 3; i < _CPU_IO_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( tensors[i], attr[i],
buffer[i], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_SUCCESS;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
uint32_t index = 6;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_rgb888_planar, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -282,21 +281,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_pre_process_rgb_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -304,7 +288,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _pre_process_rgb_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -380,4 +368,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_rgb, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -331,21 +330,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_pre_process_yuv420_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -353,7 +337,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _pre_process_yuv420_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -429,4 +417,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_yuv420, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -325,21 +324,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_pre_process_yuv444_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -347,7 +331,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _pre_process_yuv444_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -423,4 +411,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( pre_process_yuv444, _setup )

View File

@ -32,7 +32,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -144,21 +143,6 @@ static vx_param_description_t kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_prelu_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -166,7 +150,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _prelu_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -216,4 +204,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( prelu, _setup )

View File

@ -38,7 +38,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -259,4 +258,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( random_multinomial, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reduceall_internal, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reduceany_internal, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -236,4 +234,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reducemax_internal, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -75,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -95,7 +94,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -237,4 +235,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reducemin_internal, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -74,8 +73,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -94,7 +93,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -235,4 +233,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( reduceprod_internal, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -79,8 +78,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -96,7 +95,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -226,4 +224,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( relu_keras, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -215,20 +214,6 @@ static vx_param_description_t _repeat_kernel_param_def[] =
};
#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_repeat_exec,
_repeat_kernel_param_def,
_cnt_of_array( _repeat_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -236,7 +221,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _repeat_exec;
kernel->info.parameters = _repeat_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _repeat_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
@ -283,4 +272,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( repeat, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -100,7 +99,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@ -268,4 +266,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( resize_1d_bilinear, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -97,7 +96,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@ -268,4 +266,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( resize_1d_nearest, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};

View File

@ -76,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -77,8 +76,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -102,7 +101,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -246,7 +244,6 @@ final:
}
return status;
} /* _compute() */
@ -269,7 +266,6 @@ static vsi_status _query_kernel
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
@ -310,10 +306,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( resize_nearest, _setup )

View File

@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@ -150,8 +149,8 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
@ -179,7 +178,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
@ -369,10 +367,8 @@ static vsi_nn_kernel_node_t _setup
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( roi_align, _setup )

Some files were not shown because too many files have changed in this diff.