Update internal & prebuilt-sdk for 23Q1 release (#573)

Update internal to 0e9393dbb4f653b9dfceaeaaa920d4deb8b27077
Update prebuilt-sdk to 6.4.14 release
Update CMake files to support the above updates

Type: New Feature

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue committed 2023-04-18 22:19:16 +08:00 (via GitHub)
parent a32f255d7f
commit 6e38e64a1a
340 changed files with 23171 additions and 29635 deletions

View File

@ -1 +1 @@
6.4.12_CL562241A_D561555_A558512_R558399_T558462_Oeb44e5c
6.4.14_CL650117A_D650117_A648302_R647402_T648811_O646970

View File

@ -501,6 +501,8 @@ enum vx_kernel_e {
VX_KERNEL_STREAM_PROCESSOR = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x32,
VX_KERNEL_NN_BATCH_GEMM_RELU_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x33,
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};

View File

@ -173,7 +173,7 @@ VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support d
1: support
*/
#ifndef VX_DECONV_3D_API_SUPPORT
#define VX_DECONV_3D_API_SUPPORT 0
#define VX_DECONV_3D_API_SUPPORT 1
#endif
/*
@ -237,4 +237,26 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor
#define VX_SCALE_EXTRA_PARAMETER_SUPPORT 1
#endif
/*
VX_INVALIDATE_HANDLE_SUPPORT is used to declare that the vxSwapTensorHandle API has been refined to follow the Khronos OpenVX 1.3 spec: the tensor does not maintain the handle internally if new_ptr is NULL.
[value]
0: not support
1: support
*/
#ifndef VX_INVALIDATE_HANDLE_SUPPORT
#define VX_INVALIDATE_HANDLE_SUPPORT 1
#endif
/*
VX_ACTIVATION_EXT2_SUPPORT is used to declare that ACTIVATION can support sign, hard_sigmoid, neg, clip, exp, sin, cos,
log, mish, gelu, hgelu, elu, selu, celu, rcp, softsign, atan, atanh, acosh, inverse sigmoid, round and erf.
[value]
0: not support
1: support
*/
#ifndef VX_ACTIVATION_EXT2_SUPPORT
#define VX_ACTIVATION_EXT2_SUPPORT 1
#endif
#endif /* __VX_KHR_COMPATIBLE_H__ */
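The compatibility macros above are intended for compile-time feature gating in client code. A minimal sketch (editor's illustration, not part of the commit); the helper function is hypothetical:

#include <VX/vx_khr_compatible.h>

/* Hypothetical helper: report whether the driver exposes the extended
 * activation set (gelu, mish, erf, ...) natively. */
static int driver_supports_ext2_activations(void)
{
#if defined(VX_ACTIVATION_EXT2_SUPPORT) && VX_ACTIVATION_EXT2_SUPPORT
    return 1;  /* extended activations are available as fused NN activations */
#else
    return 0;  /* fall back to emulating them with separate layers */
#endif
}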

View File

@ -219,6 +219,28 @@ enum vx_nn_activation_function_e
VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6,
VX_NN_ACTIVATION_CUSTOM = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8,
VX_NN_ACTIVATION_SIGN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x9,
VX_NN_ACTIVATION_HSIGMOID_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xa,
VX_NN_ACTIVATION_NEG_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xb,
VX_NN_ACTIVATION_CLIP_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xc,
VX_NN_ACTIVATION_EXP_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xd,
VX_NN_ACTIVATION_SIN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xe,
VX_NN_ACTIVATION_COS_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xf,
VX_NN_ACTIVATION_LOG_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x10,
VX_NN_ACTIVATION_MISH_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x11,
VX_NN_ACTIVATION_GELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x12,
VX_NN_ACTIVATION_HGELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x13,
VX_NN_ACTIVATION_ELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x14,
VX_NN_ACTIVATION_SELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x15,
VX_NN_ACTIVATION_CELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x16,
VX_NN_ACTIVATION_RECIPROCAL_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x17,
VX_NN_ACTIVATION_SOFTSIGN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x18,
VX_NN_ACTIVATION_ATAN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x19,
VX_NN_ACTIVATION_ATANH_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1a,
VX_NN_ACTIVATION_ACOSH_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1b,
VX_NN_ACTIVATION_INVERSE_SIGMOID_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1c,
VX_NN_ACTIVATION_ROUND_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1d,
VX_NN_ACTIVATION_ERF_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1e,
};
/*! \brief The Convolutional network type
@ -623,6 +645,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2(
* \retval VX_ERROR_INVALID_REFERENCE tensor is not a valid <tt>\ref vx_tensor</tt> or <tt>\ref vx_image</tt> reference created from a handle.
*/
VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref);
/*! \brief Same as vxFlushHandle(); also added by VeriSilicon as an extension API.
*/
VX_API_ENTRY vx_status VX_API_CALL vxFlushHandleVSI(vx_reference ref);
#if defined(VX_INVALIDATE_HANDLE_SUPPORT) && VX_INVALIDATE_HANDLE_SUPPORT
/*! \brief Invalidates the memory referenced by the reference's handle when it is ready.
* Added by VeriSilicon as an extension API.
* \param [in] ref The reference (image or tensor) which was created from a handle.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_ERROR_INVALID_REFERENCE ref is not a valid <tt>\ref vx_tensor</tt> or <tt>\ref vx_image</tt> reference created from a handle.
*/
VX_API_ENTRY vx_status VX_API_CALL vxInvalidateHandleVSI(vx_reference ref);
#endif
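A hedged sketch of the intended call order for these handle extension APIs (editor's illustration), assuming the tensor was created from a user-provided handle, e.g. via vxCreateTensorFromHandle2(); user_ptr, new_data and data_size are application-known:

#include <string.h>

static void sync_handle_example(vx_tensor tensor, void *user_ptr,
                                const void *new_data, size_t data_size)
{
    /* Write path: update the user buffer, then flush it toward the device. */
    memcpy(user_ptr, new_data, data_size);
    vxFlushHandleVSI((vx_reference)tensor);

#if defined(VX_INVALIDATE_HANDLE_SUPPORT) && VX_INVALIDATE_HANDLE_SUPPORT
    /* Read path: invalidate first so the CPU sees the device's latest data. */
    vxInvalidateHandleVSI((vx_reference)tensor);
#endif
}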
#if VX_VA40_EXT_SUPPORT
/*! \brief Return a new tensor referencing the same memory location but with different shape.
@ -776,6 +811,14 @@ typedef struct _vx_nn_convolution_params_ext2_t
vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value; if 0, the operation is a regular convolution, otherwise (>=1) it is a depthwise convolution. */
} vx_nn_convolution_params_ext2_t;
typedef struct _vx_nn_convolution_params_ext3_t
{
vx_nn_convolution_params_ext2_t ext2; /*!< \brief Convolution extension structure head */
vx_bool isPPU; /*!< \brief merge convolution and relu for PPU. */
} vx_nn_convolution_params_ext3_t;
/*==============================================================================
NN Nodes
=============================================================================*/
@ -2142,7 +2185,8 @@ typedef struct _vx_hardware_caps_params_ext_t
typedef struct _vx_hardware_caps_params_ext2_t
{
vx_hardware_caps_params_ext_t base;
vx_uint32 streamProcessorExecCount; /*!< \brief streamprocess execution count. */
vx_uint32 streamProcessorExecCount; /*!< \brief stream processor execution count. */
vx_uint32 streamProcessorVectorSize; /*!< \brief stream processor vector size. */
} vx_hardware_caps_params_ext2_t;
/*! \brief Queries hardware caps information.

View File

@ -236,6 +236,12 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext6_t
} vx_nn_convolution_relu_pooling_params_ext6_t, * vx_nn_convolution_relu_pooling_params_ext6;
typedef struct _vx_nn_convolution_relu_pooling_params_ext7_t
{
vx_nn_convolution_relu_pooling_params_ext6_t ext6; /*!< \brief convolution relu pooling params <tt>\ref vx_nn_convolution_relu_pooling_params_ext_t</tt> */
vx_bool isSub;
} vx_nn_convolution_relu_pooling_params_ext7_t, * vx_nn_convolution_relu_pooling_params_ext7;
/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation (Relu) and Pooling Layer Node; this function matches the Khronos NN Extension 1.2 version.
* \details This function implements a Convolutional Network Convolution and Activation (Relu) and Pooling layer.
* For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of accumulator bits is implementation defined,
@ -1081,6 +1087,48 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorTableLookupLayer(
vx_lut OutLut,
vx_tensor output);
typedef struct _vx_nn_gemm_relu_pooling_params_t
{
vx_bool enable_relu; /*!< \brief Enable Relu layer function or not. */
vx_bool enable_leaky_relu; /*!< \brief Enable LeakyRelu layer function or not. */
vx_float32 alpha; /*!< \brief Alpha value for Activation */
vx_float32 beta; /*!< \brief Beta value for Activation */
vx_uint32 node_count; /*!< \brief node count to merge */
vx_float32 merged_scale[MERGED_NODE_COUNT_MAX]; /*!< \brief scale of merged node output */
vx_int32 merged_zero_point[MERGED_NODE_COUNT_MAX]; /*!< \brief zero point of merged node output */
vx_enum merged_data_type[MERGED_NODE_COUNT_MAX]; /*!< \brief data type of merged node output */
vx_enum act_func; /*!< \brief nn activation function */
vx_lut lut_in; /*!< \brief LUT in */
vx_lut lut_out; /*!< \brief LUT out */
vx_bool enbale_const_multiplier; /*!< \brief Tensor multiply where one of the inputs is a single-pixel const tensor */
vx_float32 const_multiplier; /*!< \brief const multiplier */
} vx_nn_gemm_relu_pooling_params_t, * vx_nn_gemm_relu_pooling_params;
/*! \brief Creates a batch GEMM node; the calculation formula is output = matrix_a * matrix_b + matrix_c.
* \param [in] graph The reference to the graph.
* \param [in] matrix_a The first input tensor.
* \param [in] matrix_b The second input tensor. Must be in the same data type and batch count as the first input tensor.
* \param [in] matrix_c The third input tensor. Must be in the same data type and batch count as the first input tensor. [optional]
* \param [in] trans_a If true, matrix_a is transposed before the calculation.
* \param [in] trans_b If true, matrix_b is transposed before the calculation.
* \param [in] trans_c If true, matrix_c is transposed before the calculation. [optional]
* \param [in] merge_param The parameters for GEMM + op merging.
* \param [out] output The output tensor. Output dimensions must agree with the formula in the description.
* \return <tt>\ref vx_node</tt>.
* \retval vx_node A node reference. Any possible errors preventing a successful creation
* should be checked using <tt>\ref vxGetStatus</tt>
* \ingroup group_vision_function_gemm
*/
VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmReluPoolingLayer(vx_graph graph,
vx_tensor matrix_a,
vx_tensor matrix_b,
vx_tensor matrix_c,
vx_scalar trans_a,
vx_scalar trans_b,
vx_scalar trans_c,
const vx_nn_gemm_relu_pooling_params merge_param,
vx_tensor output);
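A sketch of wiring up the new node (editor's illustration); graph, context and tensors are assumed to already exist, and whether act_func accepts the standard VX_NN_ACTIVATION_RELU value is an assumption:

#include <string.h>

static vx_node make_gemm_relu_node(vx_context context, vx_graph graph,
                                   vx_tensor matrix_a, vx_tensor matrix_b,
                                   vx_tensor output)
{
    vx_bool   bfalse  = vx_false_e;
    vx_scalar trans_a = vxCreateScalar(context, VX_TYPE_BOOL, &bfalse);
    vx_scalar trans_b = vxCreateScalar(context, VX_TYPE_BOOL, &bfalse);

    vx_nn_gemm_relu_pooling_params_t merge_param;
    memset(&merge_param, 0, sizeof(merge_param));
    merge_param.enable_relu = vx_true_e;              /* fuse ReLU into the GEMM */
    merge_param.node_count  = 0;                      /* nothing else merged */
    merge_param.act_func    = VX_NN_ACTIVATION_RELU;  /* assumed accepted here */

    /* matrix_c and trans_c are optional and passed as NULL. */
    return vxBatchGemmReluPoolingLayer(graph, matrix_a, matrix_b, NULL,
                                       trans_a, trans_b, NULL,
                                       &merge_param, output);
}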
#ifdef __cplusplus
}
#endif

View File

@ -165,6 +165,7 @@ typedef enum _vx_sp_attribute_e
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL,
VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE,
VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE,
VX_SP_ATTRIBUTE_SUM_ENGINE_OP_SELECT,
VX_SP_ATTRIBUTE_NUM_OF_ELEMENTS_PER_LOOP_PER_INPUT,
@ -181,6 +182,18 @@ typedef enum _vx_sp_attribute_e
VX_SP_ATTRIBUTE_CONST3, /* NN clamp max */
VX_SP_ATTRIBUTE_CONST4, /* NN clamp min */
VX_SP_ATTRIBUTE_CONST_COUNT,
VX_SP_ATTRIBUTE_SPLIT_AXIS,
VX_SP_ATTRIBUTE_SPLIT_MAX_SIZE,
VX_SP_ATTRIBUTE_SPLIT_TILEX_EQUAL_INIMAGEX,
VX_SP_ATTRIBUTE_NOT_MERGE_CONVSP,
VX_SP_ATTRIBUTE_UPDATE_CONST0_TO_PCQ_COEF_TENSOR,
VX_SP_ATTRIBUTE_RESHAPE_ARRAY, /* bit layout | output:24-29 | input3:18-23 | input2:12-17 | input1:6-11 | input0:0-5 | */
VX_SP_ATTRIBUTE_ALIGN_SP_CORE_AXIS,
VX_SP_ATTRIBUTE_KEEP_TILE_SIZE,
VX_SP_ATTRIBUTE_TOTAL_COUNT,
}
vx_sp_attribute_e;
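The bit layout documented on VX_SP_ATTRIBUTE_RESHAPE_ARRAY packs one 6-bit reshape code per port. A sketch of assembling such a value with the vx_sp_attribute_reshape_e codes defined further below (editor's illustration; the helper name is hypothetical):

/* | output:24-29 | input3:18-23 | input2:12-17 | input1:6-11 | input0:0-5 | */
static vx_uint32 sp_pack_reshape_array(vx_uint32 in0, vx_uint32 in1,
                                       vx_uint32 in2, vx_uint32 in3,
                                       vx_uint32 out)
{
    return  (in0 & 0x3F)
         | ((in1 & 0x3F) << 6)
         | ((in2 & 0x3F) << 12)
         | ((in3 & 0x3F) << 18)
         | ((out & 0x3F) << 24);
}

/* e.g. transpose input0 CHW->WHC and leave the other ports unchanged:
 * vx_uint32 v = sp_pack_reshape_array(VX_SP_ATTRIBUTE_RESHAPE_CHW2WHC,
 *                                     VX_SP_ATTRIBUTE_RESHAPE_CHW2CHW,
 *                                     VX_SP_ATTRIBUTE_RESHAPE_CHW2CHW,
 *                                     VX_SP_ATTRIBUTE_RESHAPE_CHW2CHW,
 *                                     VX_SP_ATTRIBUTE_RESHAPE_CHW2CHW); */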
@ -274,9 +287,55 @@ typedef enum _vx_sp_attribute_sum_engine_2d_accum_storage_e
}
vx_sp_attribute_sum_engine_2d_accum_storage_e;
typedef enum _vx_sp_attribute_sum_engine_op_select_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_SUM_OP,
VX_SP_ATTRIBUTE_SUM_ENGINE_MAX_OP
} vx_sp_attribute_sum_engine_op_select_e;
typedef enum _vx_sp_attribute_reshape_e
{
VX_SP_ATTRIBUTE_RESHAPE_CHW2CHW = 0x00,
VX_SP_ATTRIBUTE_RESHAPE_CHW2WHC = 0x06,
VX_SP_ATTRIBUTE_RESHAPE_CHW2WCH = 0x09,
VX_SP_ATTRIBUTE_RESHAPE_CHW2HWC = 0x12,
VX_SP_ATTRIBUTE_RESHAPE_CHW2HCW = 0x18,
VX_SP_ATTRIBUTE_RESHAPE_CHW2CWH = 0x21,
}
vx_sp_attribute_reshape_e;
typedef enum _vx_sp_attribute_split_axis_e
{
VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_X,
VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_Y,
VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_Z,
VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_XY,
VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_YZ,
VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_XYZ,
}
vx_sp_attribute_split_axis_e;
typedef enum _vx_sp_attribute_tile_align_sp_core_e
{
VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_NONE = 0,
VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_WITH_AXIS_X,
VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_WITH_AXIS_Y,
VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_WITH_AXIS_XY,
}
vx_sp_attribute_tile_align_sp_core_e;
typedef enum _vx_sp_attribute_keep_tile_size_e
{
VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_NONE = 0,
VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_WITH_AXIS_X,
VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_WITH_AXIS_Y,
VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_WITH_AXIS_XY,
}
vx_sp_attribute_keep_tile_size_e;
/**********************************************************************************************/
/*! \brief Creates an opaque reference to a spinst data.
/*! \brief Creates an external reference to a spinst data.
* \param [in] context The reference to the implementation context.
* \return A spinst data reference.
* Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
@ -286,7 +345,17 @@ VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST(
vx_context context
);
/*! \brief Releases a reference to a spinst object.
/*! \brief Creates an internal reference to a spinst data.
* \param [in] context The reference to the implementation context.
* \return A spinst data reference.
* Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINSTInternal(
vx_context context
);
/*! \brief Releases a reference to an external spinst object.
* The object may not be garbage collected until its total reference count is zero.
* \param [in] spinst_obj The pointer to the spinst data to release.
* \post After returning from this function the reference is zeroed.
@ -299,6 +368,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST(
vx_spinst *spinst_obj
);
/*! \brief Releases a reference to an internal spinst object.
* The object may not be garbage collected until its total reference count is zero.
* \param [in] spinst_obj The pointer to the spinst data to release.
* \post After returning from this function the reference is zeroed.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors; all other values indicate failure
* \retval * An error occurred. See <tt>\ref vx_status_e</tt>.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINSTInternal(
vx_spinst *spinst_obj
);
/*! \brief Adds an instruction to a spinst object.
* \param [in] spinst_obj The reference to the spinst object.
* \param [in] inst_unit_array The units of one instruction. Use a <tt>\ref vx_spinst_unit_param</tt>.
@ -332,6 +414,12 @@ VX_API_ENTRY vx_status VX_API_CALL vxSetAttributeToSPINST(
vx_uint32 value
);
VX_API_ENTRY vx_status VX_API_CALL vxGetAttributeToSPINST(
vx_spinst spinst_obj,
vx_enum attribute,
vx_uint32* value
);
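With the new getter, attribute writes can be verified; a minimal sketch (editor's illustration), assuming a valid spinst object and that the setter mirrors the getter's parameter order:

static vx_status set_and_check_split_axis(vx_spinst spinst_obj)
{
    vx_uint32 readback = 0;
    vx_status status = vxSetAttributeToSPINST(spinst_obj,
                                              VX_SP_ATTRIBUTE_SPLIT_AXIS,
                                              VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_X);
    if (status == VX_SUCCESS)
    {
        status = vxGetAttributeToSPINST(spinst_obj,
                                        VX_SP_ATTRIBUTE_SPLIT_AXIS,
                                        &readback);
        /* readback should now equal VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_X */
    }
    return status;
}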
#ifdef __cplusplus
}
#endif

View File

@ -539,6 +539,15 @@ typedef vx_enum vx_action;
*/
typedef vx_action (VX_CALLBACK *vx_nodecomplete_f)(vx_node node);
/*! \brief A callback to the client for querying information of a node.
* \see vx_action
* \see vxAssignNodeCallback
* \param [in] node The node to which the callback was attached.
* \return A status code from <tt>\ref vx_status_e</tt>.
* \ingroup group_node_callback
*/
typedef vx_status (VX_CALLBACK *vx_nodequery_f)(vx_node node);
/*! \brief Vendor IDs are 2 nibbles in size and are located in the upper byte of
* the 4 bytes of an enumeration.
* \ingroup group_basic_features
@ -1028,6 +1037,11 @@ enum vx_node_attribute_e {
VX_NODE_ATTRIBUTE_FOR_HW_QUALITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA,
VX_NODE_SWTILING_TILE_XY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x10,
VX_NODE_SPINST_INDEX = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x11,
VX_NODE_SPCONV_PCQ_REPLACE_SPINST = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x12,
VX_NODE_SP_NAME = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x13,
VX_NODE_SPINST = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x14,
};
/*! \brief The parameter attributes list

View File

@ -31,6 +31,7 @@ if(${TIM_VX_USE_EXTERNAL_OVXLIB})
set(OVXLIB_INCLUDE_DIR ${OVXLIB_INC})
else()
set(OVXLIB_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/src/tim/vx/internal/include")
list(APPEND OVXLIB_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/src/tim/vx/internal/src")
endif()
message(STATUS "OVXLIB include directory: ${OVXLIB_INCLUDE_DIR}")

View File

@ -69,7 +69,6 @@ filegroup(
"src/custom/ops/*.c",
"src/custom/ops/kernel/evis/*.c",
"src/custom/ops/kernel/cl/*.c",
"src/custom/ops/kernel/cpu/*.c",
])
)
@ -84,6 +83,7 @@ cc_library(
linkstatic = True,
includes = [
"include",
"src",
],
hdrs = [
"include/vsi_nn_pub.h",
@ -104,6 +104,7 @@ cc_library(
"include/vsi_nn_compatibility.h",
"include/vsi_nn_assert.h",
"include/vsi_nn_feature.h",
"include/vsi_nn_post.h",
"include/vsi_nn_rnn.h",
"include/vsi_nn_rnn_helper.h",
"include/vsi_nn_rnn_prv.h",
@ -121,13 +122,15 @@ cc_library(
"include/utils/vsi_nn_limits.h",
"include/utils/vsi_nn_dtype_util.h",
"include/utils/vsi_nn_dtype_util_prv.h",
"include/utils/vsi_nn_vdata.h",
"include/utils/vsi_nn_tensor_op.h",
"include/utils/vsi_nn_dlfcn.h",
"include/utils/vsi_nn_shape_util.h",
"include/utils/vsi_nn_constraint_check.h",
"include/quantization/vsi_nn_asymmetric_affine.h",
"include/quantization/vsi_nn_dynamic_fixed_point.h",
"include/quantization/vsi_nn_perchannel_symmetric_affine.h",
"include/post/vsi_nn_post_fasterrcnn.h",
"include/post/vsi_nn_post_cmupose.h",
"include/interface/ops.def",
"include/kernel/vsi_nn_kernel.h",
"include/kernel/vsi_nn_gpu.h",
@ -168,6 +171,9 @@ cc_library(
"src/vsi_nn_daemon.c",
"src/vsi_nn_graph_optimization.c",
"src/vsi_nn_pre_post_process.c",
"src/vsi_nn_tensor_util_prv.h",
"src/vsi_nn_types_prv.h",
"src/vsi_nn_kernel_prv.h",
"src/utils/vsi_nn_link_list.c",
"src/utils/vsi_nn_util.c",
"src/utils/vsi_nn_math.c",
@ -177,14 +183,16 @@ cc_library(
"src/utils/vsi_nn_hashmap.c",
"src/utils/vsi_nn_limits.c",
"src/utils/vsi_nn_dtype_util.c",
"src/utils/vsi_nn_vdata.c",
"src/utils/vsi_nn_tensor_op.c",
"src/utils/vsi_nn_dlfcn.c",
"src/utils/vsi_nn_shape_util.c",
"src/utils/vsi_nn_dtype.c",
"src/utils/vsi_nn_constraint_check.c",
"src/quantization/vsi_nn_asymmetric_affine.c",
"src/quantization/vsi_nn_dynamic_fixed_point.c",
"src/quantization/vsi_nn_perchannel_symmetric_affine.c",
"src/post/vsi_nn_post_fasterrcnn.c",
"src/post/vsi_nn_post_cmupose.c",
"src/kernel/vsi_nn_kernel.c",
"src/kernel/vsi_nn_kernel_util.c",
"src/kernel/vsi_nn_kernel_backend.c",
@ -202,4 +210,3 @@ cc_library(
+ [":custom_srcs"],
deps = ["//prebuilt-sdk:VIV_SDK_LIB"]
)

View File

@ -5,3 +5,4 @@ DEF_NODE_TYPE(custom_softmax)
DEF_NODE_TYPE(custom_ainr_denoise_postprocess)
DEF_NODE_TYPE(custom_warp_affine)
DEF_NODE_TYPE(custom_warp_perspective)
DEF_NODE_TYPE(custom_sample)

View File

@ -5,3 +5,4 @@ DEF_OP(CUSTOM_SOFTMAX)
DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS)
DEF_OP(CUSTOM_WARP_AFFINE)
DEF_OP(CUSTOM_WARP_PERSPECTIVE)
DEF_OP(CUSTOM_SAMPLE)

View File

@ -0,0 +1,35 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_SAMPLE_H
#define _VSI_NN_OP_CUSTOM_SAMPLE_H
#include "vsi_nn_platform.h"
#include "vsi_nn_types.h"
typedef struct _vsi_nn_custom_sample_param
{
int32_t axis;
} vsi_nn_custom_sample_param;
#endif

View File

@ -30,5 +30,6 @@
#include "custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h"
#include "custom/ops/vsi_nn_op_custom_warp_affine.h"
#include "custom/ops/vsi_nn_op_custom_warp_perspective.h"
#include "custom/ops/vsi_nn_op_custom_sample.h"
#endif

src/tim/vx/internal/include/interface/ops.def Normal file → Executable file
View File

@ -183,3 +183,13 @@ DEF_OP(LPPOOL)
DEF_OP(SCATTER_ELEMENTS)
DEF_OP(PRE_PROCESS_YUV422)
DEF_OP(BUCKETIZE)
DEF_OP(GLOBALLPPOOL)
DEF_OP(AVG_POOL3D)
DEF_OP(ATAN)
DEF_OP(ATANH)
DEF_OP(ACOSH)
DEF_OP(MAXUNPOOL)
DEF_OP(REVERSESEQUENCE)
DEF_OP(INVERSE_SIGMOID)
DEF_OP(GRID_SAMPLE)
DEF_OP(LPNORM)

src/tim/vx/internal/include/internal/internal_ops.def Normal file → Executable file
View File

@ -20,3 +20,4 @@ DEF_OP(SPACE2DEPTH_INTERNAL)
DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R)
DEF_OP(GRUCELL_ACTIVATION_Z_H)
DEF_OP(REDUCE_MEAN_INTERNAL)
DEF_OP(BILINEAR_GRID_SAMPLE)

View File

@ -59,7 +59,7 @@ typedef struct
gpu_dp_type_e type;
} gpu_dp_inst_t;
typedef struct
typedef struct VSI_PUBLIC_TYPE
{
uint32_t dim;
size_t global_offset[GPU_MAX_DIMENSION_SIZE];

View File

@ -51,7 +51,7 @@ typedef enum
VSI_NN_KERNEL_TYPE_SP,
VSI_NN_KERNEL_TYPE_NUM,
VSI_NN_KERNEL_TYPE_NONE = VSI_NN_KERNEL_TYPE_NUM
} vsi_nn_kernel_type_e;
} VSI_PUBLIC_TYPE vsi_nn_kernel_type_e;
/** Kernel priority */
enum
@ -79,7 +79,7 @@ typedef enum
BOOL8,
I4,
U4,
} vsi_nn_kernel_dtype_e;
} VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e;
typedef enum
{
@ -98,7 +98,7 @@ typedef enum
VSI_NN_GPU_SOURCE_FMT_CODE = 0,
VSI_NN_GPU_SOURCE_FMT_EXECUTABLE = 1,
VSI_NN_GPU_SOURCE_FMT_NUM
} vsi_nn_gpu_source_fmt_e;
} VSI_PUBLIC_TYPE vsi_nn_gpu_source_fmt_e;
typedef char * vsi_nn_kernel_source_t;
typedef uint32_t vsi_nn_kernel_unique_id_t;
@ -125,7 +125,7 @@ typedef struct
vsi_nn_kernel_source_info_t sources[VSI_NN_GPU_SOURCE_FMT_NUM];
vsi_nn_gpu_source_fmt_e active_source_fmt;
} gpu;
} vsi_nn_kernel_t;
} VSI_PUBLIC_TYPE vsi_nn_kernel_t;
typedef struct
{
@ -172,15 +172,15 @@ typedef struct
int32_t allow_kernel_num;
} vsi_nn_kernel_selector_t;
typedef void * vsi_nn_kernel_node_param_t;
typedef void * VSI_PUBLIC_TYPE vsi_nn_kernel_node_param_t;
typedef void * vsi_nn_kernel_tensor_t;
typedef void * vsi_nn_kernel_node_t;
typedef void * VSI_PUBLIC_TYPE vsi_nn_kernel_node_t;
typedef void * vsi_nn_kernel_graph_t;
typedef void * vsi_nn_kernel_scalar_t;
typedef void * VSI_PUBLIC_TYPE vsi_nn_kernel_scalar_t;
typedef vsi_nn_hashmap_t vsi_nn_kernel_param_t;

View File

@ -51,6 +51,10 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_LINEAR_EXP = 17,
VSI_NN_KERNEL_LUT_LINEAR_RSQRT = 18,
VSI_NN_KERNEL_LUT_LINEAR_SIGMOID = 19,
VSI_NN_KERNEL_LUT_ATAN = 20,
VSI_NN_KERNEL_LUT_ATANH = 21,
VSI_NN_KERNEL_LUT_ACOSH = 22,
VSI_NN_KERNEL_LUT_INVERSE_SIGMOID = 23,
};
@ -67,6 +71,8 @@ typedef struct _vsi_nn_kernel_lut_
typedef struct _vsi_nn_kernel_lut_params
{
vsi_enum act_type;
vsi_bool pwl_sign_remove_support;
float clamp_min;
float params[16];
} vsi_nn_kernel_lut_params;

View File

@ -47,7 +47,7 @@ typedef struct vsi_nn_kernel_info
vx_kernel_description_t ** kernel;
uint8_t kernel_index;
uint8_t init_index;
} vsi_nn_kernel_info_t;
} VSI_PUBLIC_TYPE vsi_nn_kernel_info_t;
uint8_t * vsi_nn_LoadBinarySource
(

View File

@ -112,6 +112,7 @@ typedef struct _vsi_nn_argmax_param
/* argmax layer local data structure */
vsi_nn_argmax_lcl_data local;
int32_t axis;
vsi_bool keep_dims;
} vsi_nn_argmax_param;
#ifdef __cplusplus

View File

@ -111,6 +111,7 @@ typedef struct _vsi_nn_argmin_param
/* argmin layer local data structure */
vsi_nn_argmin_lcl_data local;
int32_t axis;
vsi_bool keep_dims;
} vsi_nn_argmin_param;
#ifdef __cplusplus

View File

@ -0,0 +1,53 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_AVG_POOL3D_H
#define _VSI_NN_OP_AVG_POOL3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_avg_pool3d_param
{
/* round_type is used to calculate the output shape */
vsi_nn_round_type_e round_type;
uint32_t ksize[3];
uint32_t stride[3];
/* Pad left, right, top, bottom, front, end */
uint32_t pad[6];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
/* Whether include pad pixels when calculating value for the edges */
int32_t count_include_pad;
} vsi_nn_avg_pool3d_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -21,36 +21,31 @@
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_VDATA_H
#define _VSI_NN_VDATA_H
#include <stdio.h>
#include <stdint.h>
#ifndef _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H
#define _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
OVXLIB_API uint8_t * vsi_nn_VdataCreate
(
vsi_nn_graph_t * graph,
vsi_nn_node_t * node,
uint32_t * p_stream_size
);
OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateVDataTensor
(
vsi_nn_graph_t * graph,
uint8_t * stream,
vsi_nn_tensor_attr_t * attr
);
typedef struct _vsi_nn_bilinear_grid_sample_param
{
struct _bilinear_grid_sample_local_data_t* local;
vsi_bool align_corners;
vsi_nn_pad_mode_e padding_mode;
int32_t const_val;
} vsi_nn_bilinear_grid_sample_param;
_compiler_assert(offsetof(vsi_nn_bilinear_grid_sample_param, local) == 0, \
vsi_nn_bilinear_grid_sample_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -21,34 +21,23 @@
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#ifndef _VSI_NN_OP_GATHER_ND_H
#define _VSI_NN_OP_GATHER_ND_H
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_log.h"
#include "utils/vsi_nn_util.h"
#include "vsi_nn_types.h"
uint8_t * vsi_nn_VdataCreate
(
vsi_nn_graph_t * graph,
vsi_nn_node_t * node,
uint32_t * p_stream_size
)
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_gather_nd_param
{
return NULL;
} /* vsi_nn_VdataCreate() */
int32_t batch_dims;
} vsi_nn_gather_nd_param;
vsi_nn_tensor_t * vsi_nn_CreateVDataTensor
(
vsi_nn_graph_t * graph,
uint8_t * stream,
vsi_nn_tensor_attr_t * attr
)
{
return NULL;
} /* vsi_nn_CreateVDataTensor() */
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GLOBALLPPOOL_H
#define _VSI_NN_OP_GLOBALLPPOOL_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_globallppool_param
{
int32_t p;
} vsi_nn_globallppool_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,58 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GRID_SAMPLE_H
#define _VSI_NN_OP_GRID_SAMPLE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
//typedef uint32_t vsi_nn_grid_sample_mode_t;
//enum { bilinear = 0, nearest };
//
//typedef uint32_t vsi_nn_grid_sample_padding_mode_t;
//enum { zeros = 0, CONST };
typedef struct _grid_sample_local_data_t {
int32_t placeholder;
} grid_sample_local_data_t;
typedef struct _vsi_nn_grid_sample_param
{
grid_sample_local_data_t* local;
vsi_enum mode;
vsi_bool align_corners;
vsi_nn_pad_mode_e padding_mode;
int32_t const_val;
} vsi_nn_grid_sample_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -67,7 +67,7 @@ typedef struct _vsi_nn_imageprocess_param
int32_t mean_value_size;
float* mean_value;
} mean;
} vsi_nn_imageprocess_param;
} VSI_PUBLIC_TYPE vsi_nn_imageprocess_param;
/**
* Insert imageprocess op for image pre process

View File

@ -0,0 +1,45 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_INVERSE_SIGMOID_H
#define _VSI_NN_OP_INVERSE_SIGMOID_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_inverse_sigmoid_param
{
// Add parameters here
float eps;
} vsi_nn_inverse_sigmoid_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,45 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_LPNORM_H
#define _VSI_NN_OP_LPNORM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_lpnorm_param
{
int axis;
int p;
} vsi_nn_lpnorm_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,48 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_MAXUNPOOL_H
#define _VSI_NN_OP_MAXUNPOOL_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_maxunpool_param
{
// Add parameters here
uint32_t ksize[2];
uint32_t pad[4];
uint32_t stride[2];
const uint32_t *output_size;
} vsi_nn_maxunpool_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -68,6 +68,8 @@ typedef struct _vsi_nn_pre_process_nv12_param
vsi_bool reverse_channel;
vsi_nn_pre_process_nv12_lcl_data* local;
vsi_nn_nv_type nv_type;
} vsi_nn_pre_process_nv12_param;
#ifdef __cplusplus

View File

@ -38,6 +38,7 @@ typedef struct _vsi_nn_reduce_mean_internal_param
vx_int32 *axis;
vx_uint32 axis_num;
float scale;
vsi_enum type;
} vsi_nn_reduce_mean_internal_param;
_compiler_assert(offsetof(vsi_nn_reduce_mean_internal_param, local) == 0, \
vsi_nn_reduce_mean_internal_h );

View File

@ -0,0 +1,45 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_REVERSESEQUENCE_H
#define _VSI_NN_OP_REVERSESEQUENCE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_reversesequence_param
{
int32_t batch_axis;
int32_t time_axis;
} vsi_nn_reversesequence_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -38,6 +38,7 @@ typedef struct _vsi_nn_roi_align_param
float width_ratio;
int32_t height_sample_num;
int32_t width_sample_num;
vsi_nn_roi_align_type_e platform_type;
} vsi_nn_roi_align_param;
#ifdef __cplusplus

View File

@ -71,6 +71,7 @@ typedef struct _vsi_nn_strided_slice_lcl_data2
vsi_bool is_dataconvert_op;
vsi_bool is_optimized;
vsi_bool is_same_shape;
strided_slice_param params;
} vsi_nn_strided_slice_lcl_data2;

View File

@ -0,0 +1,163 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_POST_CMUPOSE_H_
#define _VSI_NN_POST_CMUPOSE_H_
#include "utils/vsi_nn_link_list.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_subset_data_t
{
float idx[20];
}vsi_nn_subset_data_t;
typedef struct _vsi_nn_subset_t
{
vsi_nn_link_list_t link_list;
vsi_nn_subset_data_t data;
} VSI_PUBLIC_TYPE vsi_nn_subset_t;
typedef struct _vsi_nn_peaks_data_t
{
uint32_t location[2];
float score;
uint32_t id;
} VSI_PUBLIC_TYPE vsi_nn_peaks_data_t;
typedef struct _vsi_nn_peaks_t
{
vsi_nn_link_list_t link_list;
vsi_nn_peaks_data_t peak;
} VSI_PUBLIC_TYPE vsi_nn_peaks_t;
typedef struct _vsi_nn_conncection_data_t
{
uint32_t x;
uint32_t y;
float score;
uint32_t i;
uint32_t j;
}vsi_nn_connection_data_t;
typedef struct _vsi_nn_connection_t
{
vsi_nn_link_list_t link_list;
vsi_nn_connection_data_t data;
}vsi_nn_connection_t;
typedef struct _vsi_nn_con_candidate_data_t
{
uint32_t i;
uint32_t j;
float score;
float candAB;
}vsi_nn_con_candidate_data_t;
typedef struct _vsi_nn_con_candidate_t
{
vsi_nn_link_list_t link_list;
vsi_nn_con_candidate_data_t data;
}vsi_nn_con_candidate_t;
typedef struct _vsi_nn_cmupose_multiplier_t
{
float *size;
uint32_t num;
}vsi_nn_cmupose_multiplier_t;
typedef struct _vsi_nn_cmupose_image_t
{
uint32_t width;
uint32_t height;
uint32_t channel;
} VSI_PUBLIC_TYPE vsi_nn_cmupose_image_t;
typedef struct _vsi_nn_cmupose_scale_search_t
{
float *size;
uint32_t num;
}vsi_nn_cmupose_scale_search_t;
typedef struct _vsi_nn_cmupose_model_t
{
uint32_t boxsize;
uint32_t stride;
uint32_t padValue;
} VSI_PUBLIC_TYPE vsi_nn_cmupose_model_t;
typedef struct _vsi_nn_cmupose_param_t
{
float thre1;
float thre2;
float thre3;
uint32_t mid_num;
vsi_nn_cmupose_scale_search_t scale_search;
} VSI_PUBLIC_TYPE vsi_nn_cmupose_param_t;
typedef struct _vsi_nn_cmupose_inputs_t
{
vsi_nn_tensor_t *net_out;
} VSI_PUBLIC_TYPE vsi_nn_cmupose_inputs_t;
typedef struct _vsi_nn_cmupose_config_t
{
vsi_nn_cmupose_inputs_t inputs;
vsi_nn_cmupose_param_t param;
vsi_nn_cmupose_model_t model;
vsi_nn_cmupose_image_t image;
} VSI_PUBLIC_TYPE vsi_nn_cmupose_config_t;
OVXLIB_API vsi_status vsi_nn_CMUPose_Post_Process
(
float *net_out,
vsi_nn_cmupose_config_t *config,
vsi_nn_peaks_t ***all_peaks_out,
uint32_t *all_peaks_num_out,
vsi_nn_subset_t **subset_list_out,
vsi_nn_peaks_data_t **peak_candidate_out,
uint32_t *peak_candidate_num_out
);
OVXLIB_API vsi_status vsi_nn_CMUPose_PostProcess
(
vsi_nn_graph_t *graph,
vsi_nn_cmupose_inputs_t *inputs,
vsi_nn_cmupose_image_t *image,
vsi_nn_cmupose_param_t *param,
vsi_nn_cmupose_model_t *model,
vsi_nn_peaks_t ***all_peaks,
uint32_t *all_peaks_num,
vsi_nn_peaks_data_t **candidate,
uint32_t *candidate_num,
vsi_nn_subset_t **subset
);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,79 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_POST_FASTERRCNN_H_
#define _VSI_NN_POST_FASTERRCNN_H_
#include "vsi_nn_types.h"
#include "vsi_nn_node_type.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_link_list.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_fasterrcnn_box_t
{
vsi_nn_link_list_t link_list;
/* upper-left coordinate(x1,y1) */
float x1;
float y1;
/* lower-right coordinate(x2,y2) */
float x2;
float y2;
float score;
uint32_t class_id;
} VSI_PUBLIC_TYPE vsi_nn_fasterrcnn_box_t;
typedef struct _vsi_nn_fasterrcnn_param_t
{
float conf_thresh;
float nms_thresh;
const char **classes;
uint32_t classes_num;
uint32_t rois_num;
vsi_nn_proposal_im_info iminfo;
} VSI_PUBLIC_TYPE vsi_nn_fasterrcnn_param_t;
typedef struct _vsi_nn_fasterrcnn_inputs_t
{
vsi_nn_tensor_t *rois;
vsi_nn_tensor_t *cls;
vsi_nn_tensor_t *bbox;
} VSI_PUBLIC_TYPE vsi_nn_fasterrcnn_inputs_t;
OVXLIB_API vsi_status vsi_nn_FasterRCNN_PostProcess
(
vsi_nn_graph_t *graph,
vsi_nn_fasterrcnn_inputs_t *inputs,
vsi_nn_fasterrcnn_param_t *param,
vsi_nn_fasterrcnn_box_t **dets_box
);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -29,8 +29,9 @@ extern "C"{
#endif
#include <stdint.h>
#include "vsi_nn_feature_config.h"
typedef int64_t vsi_nn_binary_tree_key_t;
typedef int64_t VSI_PUBLIC_TYPE vsi_nn_binary_tree_key_t;
#define vsi_nn_BinaryTreeInitRoot(n) do{n = NULL;} while (0);
@ -40,7 +41,7 @@ typedef struct _vsi_nn_binary_tree
struct _vsi_nn_binary_tree * right;
vsi_nn_binary_tree_key_t key;
void * data_ptr;
} vsi_nn_binary_tree_t;
} VSI_PUBLIC_TYPE vsi_nn_binary_tree_t;
OVXLIB_API void vsi_nn_BinaryTreeRemoveNode
(

View File

@ -0,0 +1,65 @@
#ifndef __VSI_NN_DLFCN_H
#define __VSI_NN_DLFCN_H
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define RTLD_LAZY 0
#define RTLD_NOW 0
#define RTLD_GLOBAL (1 << 1)
#define RTLD_LOCAL (1 << 2)
#define RTLD_DEFAULT ((void *)0)
#define RTLD_NEXT ((void *)-1)
#else
#include <dlfcn.h>
#endif
/**
* Opens a shared library.
*
* @param[in] file Library path.
* @param[in] mode Open mode.
*
* @return Library handle on success, or NULL otherwise.
*/
void* vsi_nn_dlopen
(
const char *file,
int mode
);
/**
* Closes an opened library.
*
* @param[in] handle Library handle.
*
* @return TRUE on success.
*/
int vsi_nn_dlclose
(
void *handle
);
/**
* Finds a symbol in an opened library.
*
* @param[in] handle Library handle.
* @param[in] name Symbol name to find.
*
* @return Symbol address.
*/
void* vsi_nn_dlsym
(
void *handle,
const char *name
);
/**
* Get error info.
*
* @return Error message.
*/
char * vsi_nn_dlerror(void);
#endif
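A sketch of the portable wrapper in use (editor's illustration); the library path and symbol name are purely illustrative:

#include "utils/vsi_nn_dlfcn.h"
#include <stdio.h>

static void load_plugin_example(void)
{
    void *handle = vsi_nn_dlopen("libcustom_kernels.so", RTLD_NOW);
    if (handle == NULL)
    {
        printf("dlopen failed: %s\n", vsi_nn_dlerror());
        return;
    }
    typedef int (*init_fn_t)(void);
    init_fn_t init = (init_fn_t)vsi_nn_dlsym(handle, "plugin_init");
    if (init != NULL)
    {
        init();  /* hypothetical plugin entry point */
    }
    vsi_nn_dlclose(handle);
}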

View File

@ -464,6 +464,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
case VSI_NN_TYPE_BOOL8:
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
case VSI_NN_TYPE_INT32:
{
int32_t src_value = 0;
@ -516,6 +517,7 @@ static VSI_INLINE_API vsi_status float32_to_dtype
case VSI_NN_TYPE_BOOL8:
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
case VSI_NN_TYPE_INT32:
case VSI_NN_TYPE_UINT32:
{

View File

@ -36,7 +36,7 @@ typedef struct _vsi_nn_link_list
{
struct _vsi_nn_link_list * prev;
struct _vsi_nn_link_list * next;
} vsi_nn_link_list_t;
} VSI_PUBLIC_TYPE vsi_nn_link_list_t;
typedef void ( * vsi_nn_link_list_init_t )
(

View File

@ -32,7 +32,7 @@
extern "C"{
#endif
typedef vsi_nn_binary_tree_key_t vsi_nn_map_key_t;
typedef vsi_nn_binary_tree_key_t VSI_PUBLIC_TYPE vsi_nn_map_key_t;
typedef struct _vsi_nn_map_key_list
{
@ -45,7 +45,7 @@ typedef struct _vsi_nn_map
int size;
vsi_nn_map_key_list_t * keys;
vsi_nn_binary_tree_t * values;
} vsi_nn_map_t;
} VSI_PUBLIC_TYPE vsi_nn_map_t;
OVXLIB_API void vsi_nn_MapInit
(

View File

@ -99,6 +99,30 @@ typedef enum vx_nn_activation_function_e vx_co
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_NONE VX_NN_ACTIVATION_NONE
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH VX_NN_ACTIVATION_SWISH
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HWISH VX_NN_ACTIVATION_HSWISH
#if (VX_ACTIVATION_EXT2_SUPPORT)
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SIGN VX_NN_ACTIVATION_SIGN_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HSIGMOID VX_NN_ACTIVATION_HSIGMOID_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_NEG VX_NN_ACTIVATION_NEG_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_CLIP VX_NN_ACTIVATION_CLIP_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_EXP VX_NN_ACTIVATION_EXP_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SIN VX_NN_ACTIVATION_SIN_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_COS VX_NN_ACTIVATION_COS_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOG VX_NN_ACTIVATION_LOG_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_MISH VX_NN_ACTIVATION_MISH_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_GELU VX_NN_ACTIVATION_GELU_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HGELU VX_NN_ACTIVATION_HGELU_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ELU VX_NN_ACTIVATION_ELU_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SELU VX_NN_ACTIVATION_SELU_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_CELU VX_NN_ACTIVATION_CELU_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RECIPROCAL VX_NN_ACTIVATION_RECIPROCAL_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTSIGN VX_NN_ACTIVATION_SOFTSIGN_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ATAN VX_NN_ACTIVATION_ATAN_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ATANH VX_NN_ACTIVATION_ATANH_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ACOSH VX_NN_ACTIVATION_ACOSH_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_INVERSE_SIGMOID VX_NN_ACTIVATION_INVERSE_SIGMOID_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ROUND VX_NN_ACTIVATION_ROUND_VSI
#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ERF VX_NN_ACTIVATION_ERF_VSI
#endif
/*
keep the backward compatibility with spec 1.1 for vxCopyTensorPatch_11

View File

@ -77,6 +77,7 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_concat_optimize;
int32_t enable_asymi8_to_u8;
int32_t enable_dataconvert_optimize;
int32_t enable_stream_processor;
} vsi_nn_runtime_option_t;
/**
@ -87,7 +88,7 @@ typedef struct _vsi_nn_context_t
vx_context c;
vsi_nn_hw_config_t config;
vsi_nn_runtime_option_t options;
} *vsi_nn_context_t;
} VSI_PUBLIC_TYPE *vsi_nn_context_t;
/**
* Create context

View File

@ -1,7 +1,46 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the Software),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
/*****Auto generated header file, Please DO NOT modify manually!*****/
#ifndef _VSI_NN_FEATURE_CONFIG_H
#define _VSI_NN_FEATURE_CONFIG_H
#define VSI_PUBLIC_TYPE
#include <VX/vx_khr_cnn.h>
#if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY)
#include <VX/vx_khr_compatible.h>
#endif
#ifndef VSI_PERCHANNEL_QUANTIZATION_SUPPORT
#define VSI_PERCHANNEL_QUANTIZATION_SUPPORT
#endif
#if defined(VX_INVALIDATE_HANDLE_SUPPORT) && VX_INVALIDATE_HANDLE_SUPPORT
#define VSI_INVALIDATE_HANDLE_SUPPORT
#endif
#ifndef VSI_0_D_TENSOR_SUPPORT
#define VSI_0_D_TENSOR_SUPPORT
#endif
#if defined(VX_TENSORVIEW_ON_ANY_DIM) && VX_TENSORVIEW_ON_ANY_DIM
#define VSI_CONCAT_ENHANCE_SUPPORT
#endif
#endif

View File

@ -74,7 +74,7 @@ extern "C" {
/**
* Graph structure
*/
struct _vsi_nn_graph
struct VSI_PUBLIC_TYPE _vsi_nn_graph
{
/** Context */
vsi_nn_context_t ctx;
@ -167,6 +167,8 @@ struct _vsi_nn_graph
} complete_signal;
vsi_bool isAllowFastMode;
// DO NOT modify this struct.
};
/**

View File

@ -46,7 +46,7 @@ typedef enum _vsi_nn_log_level_e
VSI_NN_LOG_WARN,
VSI_NN_LOG_INFO,
VSI_NN_LOG_DEBUG
}vsi_nn_log_level_e;
} VSI_PUBLIC_TYPE vsi_nn_log_level_e;
#define VSI_NN_MAX_DEBUG_BUFFER_LEN 1024
#define VSILOGE( fmt, ... ) \

View File

@ -58,7 +58,7 @@ typedef struct _vsi_nn_node_attr_t
} vsi_nn_node_attr_t;
/** Node structure */
struct _vsi_nn_node
struct VSI_PUBLIC_TYPE _vsi_nn_node
{
/**
* Graph handle

View File

@ -200,8 +200,17 @@
#include "ops/vsi_nn_op_scatter_elements.h"
#include "ops/vsi_nn_op_pre_process_yuv422.h"
#include "ops/vsi_nn_op_bucketize.h"
#include "ops/vsi_nn_op_globallppool.h"
#include "ops/vsi_nn_op_gather_nd.h"
#include "ops/vsi_nn_op_avg_pool3d.h"
#include "ops/vsi_nn_op_maxunpool.h"
#include "ops/vsi_nn_op_reversesequence.h"
#include "ops/vsi_nn_op_grid_sample.h"
#include "ops/vsi_nn_op_bilinear_grid_sample.h"
#include "ops/vsi_nn_op_lpnorm.h"
/* custom node header defines */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
#if defined(__cplusplus)
extern "C"{
@ -386,6 +395,15 @@ typedef union _vsi_nn_nn_param
vsi_nn_scatter_elements_param scatter_elements;
vsi_nn_pre_process_yuv422_param pre_process_yuv422;
vsi_nn_bucketize_param bucketize;
vsi_nn_globallppool_param globallppool;
vsi_nn_gather_nd_param gather_nd;
vsi_nn_avg_pool3d_param avg_pool3d;
vsi_nn_maxunpool_param maxunpool;
vsi_nn_reversesequence_param reversesequence;
vsi_nn_inverse_sigmoid_param inverse_sigmoid;
vsi_nn_grid_sample_param gridsample;
vsi_nn_bilinear_grid_sample_param bilinear_grid_sample;
vsi_nn_lpnorm_param lpnorm;
void* client_param;
/* custom node data struct define */

View File

@ -48,7 +48,7 @@ extern "C"{
* @see include/custom/custom_ops.def
* @see include/internal/internal_ops.def
*/
typedef int32_t vsi_nn_op_t; enum
typedef int32_t VSI_PUBLIC_TYPE vsi_nn_op_t; enum
{
#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME,
#include "interface/ops.def"
@ -126,7 +126,7 @@ typedef struct _vsi_nn_op_proc
vsi_nn_op_optimize_t optimize;
uint32_t input_num;
uint32_t output_num;
} vsi_nn_op_proc_t;
} VSI_PUBLIC_TYPE vsi_nn_op_proc_t;
/*------------------------------------
Functions

View File

@ -26,13 +26,6 @@
#include "vsi_nn_feature_config.h"
#ifdef VSI_40BIT_VA_SUPPORT
#ifdef VX_VA40_EXT_SUPPORT
#undef VX_VA40_EXT_SUPPORT
#endif
#define VX_VA40_EXT_SUPPORT 1
#endif
#include <VX/vx_khr_cnn.h>
#include <VX/vx_helper.h>
#include <VX/vx_ext_program.h>
@ -48,12 +41,4 @@
*/
#include "vsi_nn_compatibility.h"
#if defined(__cplusplus)
extern "C"{
#endif
#if defined(__cplusplus)
}
#endif
#endif

View File

@ -87,6 +87,7 @@ typedef enum
VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP,
VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422,
VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422,
VSI_NN_SOURCE_FORMAT_IMAGE_NV21,
} vsi_nn_preprocess_source_format_e;
/**
@ -98,7 +99,7 @@ typedef struct
vsi_nn_preprocess_type_e type;
/** Preprocess paramters */
void* param;
} vsi_nn_preprocess_base_t;
} VSI_PUBLIC_TYPE vsi_nn_preprocess_base_t;
/**
* Postprocess base structure
@ -109,7 +110,7 @@ typedef struct
vsi_nn_postprocess_type_e type;
/** Postrocess paramters */
void* param;
} vsi_nn_postprocess_base_t;
} VSI_PUBLIC_TYPE vsi_nn_postprocess_base_t;
/**
* Process dtype convert parameter structure

View File

@ -44,6 +44,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_version.h"
#include "vsi_nn_assert.h"
#include "vsi_nn_post.h"
#include "vsi_nn_rnn.h"
#include "vsi_nn_test.h"
#include "vsi_nn_pre_post_process.h"

View File

@ -44,7 +44,7 @@ typedef struct
{
vsi_nn_tensor_id_t output;
vsi_nn_tensor_id_t inputs[VSI_NN_MAX_RNN_CONNECTION_INPUTS];
} vsi_nn_rnn_external_connection_t;
} VSI_PUBLIC_TYPE vsi_nn_rnn_external_connection_t;
/*-------------------------------------------
Procedure to prepare input data, return FALSE

View File

@ -63,7 +63,7 @@ typedef enum
VSI_NN_DIM_FMT_NHWC = 0x01,
VSI_NN_DIM_FMT_NA = 0xFF,
VSI_NN_DIM_FMT_AUTO = VSI_NN_DIM_FMT_NA - 1,
} vsi_nn_dim_fmt_e;
} VSI_PUBLIC_TYPE vsi_nn_dim_fmt_e;
/**
* Quantization type.
@ -125,7 +125,7 @@ typedef struct vsi_nn_dtype
#endif
};
};
} vsi_nn_dtype_t;
} VSI_PUBLIC_TYPE vsi_nn_dtype_t;
/**
* Tensor Attribute
@ -150,15 +150,13 @@ typedef struct vsi_nn_tensor_attr
#ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL
vsi_memory_type_e vsi_memory_type;
#endif
#if VX_STREAM_PROCESSOR_SUPPORT
vsi_bool is_dummy;
#endif
} vsi_nn_tensor_attr_t;
// DO NOT modify this struct.
} VSI_PUBLIC_TYPE vsi_nn_tensor_attr_t;
/**
* Tensor structure
*/
struct _vsi_nn_tensor
struct VSI_PUBLIC_TYPE _vsi_nn_tensor
{
/** Tensor attributes */
vsi_nn_tensor_attr_t attr;
@ -168,6 +166,7 @@ struct _vsi_nn_tensor
vx_weights_biases_parameter wb;
/** Mark tensor swapped by vxSwapTensor */
int8_t is_swapped;
// DO NOT modify this struct.
};
/**

View File

@ -321,10 +321,38 @@ OVXLIB_API vsi_status vsi_nn_CopyDataToTensor
);
/**
* Flush Handle
* If you swap the handle of the tensor, you should flush it.
* Swap a tensor's handle
* Swap the current handle out to old_ptr for read/write access, and swap new_ptr in as the tensor's new handle.
*
* @param[in] tensor Tensor handle.
* The APP SHOULD keep track of any handle it created itself so it can manage memory correctly,
* and must never free or write data through a handle allocated by OVXLIB.
*
* OVXLIB will no longer maintain the original handle if new_ptr == NULL.
*
* Before freeing data in a handle allocated by the APP, vsi_nn_SwapHandle(tensor, NULL, FALSE, &prev_ptr)
* should be called to regain control of the handle.
*
* @param[in] tensor Tensor.
* @param[in] new_ptr New handle of tensor.
* @param[in] is_new_ptr_malloc_by_ovxlib Whether new_ptr was allocated by OVXLIB; only meaningful when new_ptr is not NULL.
* @param[out] old_ptr Old handle of tensor.
*
* @return VSI_SUCCESS on success, or an error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_SwapHandle
(
vsi_nn_tensor_t* tensor,
void* new_ptr,
vsi_bool is_new_ptr_malloc_by_ovxlib,
void** old_ptr
);
/**
* Flush Handle
* Call this function to flush newly written data to the handle currently in use.
* vsi_nn_FlushHandle() should be called last to complete the data-writing operation.
*
* @param[in] tensor Tensor.
*
* @return VSI_SUCCESS on success, or an error code otherwise.
*/
@ -333,6 +361,20 @@ OVXLIB_API vsi_status vsi_nn_FlushHandle
const vsi_nn_tensor_t * tensor
);
/**
* Invalidate Handle
* Invalidate the handle before copying data out of it.
* Before reading data through the handle, vsi_nn_InvalidateHandle() should be called to
* invalidate the application-side cache.
*
* @param[in] tensor Tensor.
*
* @return VSI_SUCCESS on success, or an error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_InvalidateHandle
(
const vsi_nn_tensor_t* tensor
);
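/*
 * Usage sketch for the handle APIs above. This is illustrative only: `app_buf`,
 * `src_data` and `data_size` are placeholder names, and the tensor is assumed
 * to have been created from the application-owned buffer `app_buf`.
 *
 *   void* prev_ptr = NULL;
 *
 *   // Write path: fill the buffer, then flush so the driver sees the new data.
 *   memcpy(app_buf, src_data, data_size);
 *   vsi_nn_FlushHandle(tensor);
 *
 *   // Read path: invalidate the application-side cache before reading.
 *   vsi_nn_InvalidateHandle(tensor);
 *   vsi_nn_GetTensorHandle(tensor, &prev_ptr);
 *
 *   // Hand ownership back to the application before freeing the buffer.
 *   vsi_nn_SwapHandle(tensor, NULL, FALSE, &prev_ptr);
 *   free(app_buf);
 */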
/**
* Get Tensor Handle
* Get the handle of the tensor
@ -348,6 +390,34 @@ OVXLIB_API vsi_status vsi_nn_GetTensorHandle
void** ptr
);
/**
* Get Tensor is_scalar
* Get the is_scalar flag of the tensor
*
* @param[in] tensor Tensor.
*
* @return is_scalar flag of the tensor.
*/
OVXLIB_API int8_t vsi_nn_GetTensorIsScalar
(
vsi_nn_tensor_t* tensor
);
/**
* Set Tensor is_scalar
* Set the is_scalar for the tensor
*
* @param[in] tensor Tensor.
* @param[in] is_scalar New is_scalar value for the tensor.
*
* @return VSI_SUCCESS on success, or an error code otherwise.
*/
OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
(
vsi_nn_tensor_t* tensor,
int8_t is_scalar
);
OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
(
vsi_nn_graph_t* graph,
@ -722,13 +792,6 @@ vsi_nn_tensor_t* vsi_nn_ConstTensorAdd_impl
#define vsi_nn_ConstTensorAdd(_graph, _output_attr, ...) \
vsi_nn_ConstTensorAdd_impl(_graph, _output_attr, __VA_ARGS__, END_OF_VARIADIC_ARGUMENTS)
vsi_status vsi_nn_SwapHandle
(
vsi_nn_tensor_t * tensor,
void * new_ptr,
void ** old_ptr
);
vsi_bool vsi_nn_ConvertTensor
(
vsi_nn_graph_t* graph,

View File

@ -27,7 +27,6 @@
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_feature_config.h"
#if defined(__cplusplus)
extern "C"{
@ -109,7 +108,7 @@ typedef enum
VSI_NN_PAD_AUTO,
VSI_NN_PAD_VALID,
VSI_NN_PAD_SAME
} vsi_nn_pad_e;
} VSI_PUBLIC_TYPE vsi_nn_pad_e;
/** reduce type enum */
typedef enum
@ -142,14 +141,14 @@ typedef enum
{
VSI_NN_ROUND_CEIL,
VSI_NN_ROUND_FLOOR
} vsi_nn_round_type_e;
} VSI_PUBLIC_TYPE vsi_nn_round_type_e;
/** Optimize driction */
typedef enum
{
VSI_NN_OPTIMIZE_FORWARD,
VSI_NN_OPTIMIZE_BACKWARD
} vsi_nn_opt_direction_e;
} VSI_PUBLIC_TYPE vsi_nn_opt_direction_e;
#ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL
typedef enum
{
@ -195,7 +194,7 @@ typedef enum
#endif
VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1,
}vsi_nn_type_e;
} VSI_PUBLIC_TYPE vsi_nn_type_e;
typedef int32_t vsi_nn_activation_e; enum
{
@ -236,7 +235,7 @@ typedef enum
{
VSI_NN_GRAPH_PRELOAD_VIPSRAM,
VSI_NN_GRAPH_PRELOAD_AXISRAM
} vsi_nn_graph_attr_preload_type_e;
} VSI_PUBLIC_TYPE vsi_nn_graph_attr_preload_type_e;
typedef enum _vsi_nn_node_attr_preload_type_e
{
@ -257,23 +256,35 @@ typedef enum _vsi_nn_yuv_type
VSI_NN_YUV_TYPE_UYUV422
}vsi_nn_yuv_type;
typedef enum _vsi_nn_nv_type
{
VSI_NN_YUV_TYPE_NV12,
VSI_NN_YUV_TYPE_NV21
}vsi_nn_nv_type;
typedef enum _vsi_nn_roi_align_type_e
{
VSI_NN_ROI_ALIGN_ANDROID,
VSI_NN_ROI_ALIGN
} vsi_nn_roi_align_type_e;
/** Deprecated */
typedef uint32_t vsi_nn_size_t;
/** Tensor id type */
typedef uint32_t vsi_nn_tensor_id_t;
typedef uint32_t VSI_PUBLIC_TYPE vsi_nn_tensor_id_t;
/** Node id type */
typedef uint32_t vsi_nn_node_id_t;
/** @see _vsi_nn_graph */
typedef struct _vsi_nn_graph vsi_nn_graph_t;
typedef struct _vsi_nn_graph VSI_PUBLIC_TYPE vsi_nn_graph_t;
/** @see _vsi_nn_node */
typedef struct _vsi_nn_node vsi_nn_node_t;
typedef struct _vsi_nn_node VSI_PUBLIC_TYPE vsi_nn_node_t;
/** @see _vsi_nn_tensor */
typedef struct _vsi_nn_tensor vsi_nn_tensor_t;
typedef struct _vsi_nn_tensor VSI_PUBLIC_TYPE vsi_nn_tensor_t;
#if defined(__cplusplus)
}

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 57
#define VSI_NN_VERSION_PATCH 74
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
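/* e.g. version 1.1.74 encodes as 1 * 10000 + 1 * 100 + 74 = 10174 */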

View File

@ -0,0 +1,144 @@
#
# Build ovxlib for android.
#
LOCAL_PATH:= $(call my-dir)
include $(CLEAR_VARS)
ifeq ($(AQROOT),)
$(error Please set AQROOT env first)
endif
include $(AQROOT)/Android.mk.def
ifeq ($(PLATFORM_VENDOR),1)
LOCAL_VENDOR_MODULE := true
endif
LOCAL_SRC_FILES := \
vsi_nn_context.c \
vsi_nn_client_op.c \
vsi_nn_graph.c \
vsi_nn_node_attr_template.c \
vsi_nn_node.c \
vsi_nn_ops.c \
vsi_nn_daemon.c \
vsi_nn_tensor.c \
vsi_nn_version.c \
vsi_nn_rnn.c \
vsi_nn_rnn_helper.c \
vsi_nn_internal_node.c \
vsi_nn_log.c \
vsi_nn_graph_optimization.c \
vsi_nn_pre_post_process.c
LOCAL_SRC_FILES += \
utils/vsi_nn_code_generator.c \
utils/vsi_nn_binary_tree.c \
utils/vsi_nn_map.c \
utils/vsi_nn_hashmap.c \
utils/vsi_nn_link_list.c \
utils/vsi_nn_math.c \
utils/vsi_nn_dtype.c \
utils/vsi_nn_dtype_util.c \
utils/vsi_nn_shape_util.c \
utils/vsi_nn_limits.c \
utils/vsi_nn_tensor_op.c \
utils/vsi_nn_util.c \
utils/vsi_nn_dlfcn.c \
utils/vsi_nn_constraint_check.c
LOCAL_SRC_FILES += \
quantization/vsi_nn_dynamic_fixed_point.c \
quantization/vsi_nn_asymmetric_affine.c \
quantization/vsi_nn_perchannel_symmetric_affine.c
LOCAL_SRC_FILES += \
post/vsi_nn_post_fasterrcnn.c \
post/vsi_nn_post_cmupose.c
LOCAL_SRC_FILES += \
cpu_backend/vsi_nn_cpu_backend.c \
cpu_backend/vsi_nn_cpu_backend_conv2d.c \
cpu_backend/vsi_nn_cpu_backend_deconv2d.c \
cpu_backend/npuref_interface.c
LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c \
libnnext/vsi_nn_vxkernel.c
LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \
kernel/vsi_nn_kernel_util.c \
kernel/vsi_nn_kernel_backend.c \
kernel/vsi_nn_kernel_eltwise.c \
kernel/vsi_nn_kernel_selector.c \
kernel/vsi_nn_kernel_node.c \
kernel/vsi_nn_kernel_param.c \
kernel/vsi_nn_kernel_gpu_shape_optimize.c \
kernel/vsi_nn_kernel_lut.c \
kernel/vsi_nn_spinst.c \
kernel/vsi_nn_sp_unit_operation.c \
kernel/vsi_nn_sp_lut.c \
kernel/vsi_nn_gpu.c
LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c)
LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%)
KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/kernel/cl/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/cpu/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/evis/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/vx/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/sp/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/evis/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/cl/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/cpu/*.c)
KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/sp/*.c)
LOCAL_SRC_FILES += $(KERNEL_SOURCES:$(LOCAL_PATH)/%=%)
OPERATION_SOURCES := $(wildcard $(LOCAL_PATH)/ops/*.c)
LOCAL_SRC_FILES += $(OPERATION_SOURCES:$(LOCAL_PATH)/%=%)
LOCAL_SHARED_LIBRARIES := \
liblog \
libjpeg \
libGAL \
libOpenVX \
libVSC \
libdl
LOCAL_C_INCLUDES += \
external/libjpeg-turbo \
$(AQROOT)/sdk/inc/CL \
$(AQROOT)/sdk/inc/VX \
$(AQROOT)/sdk/inc/ \
$(AQROOT)/sdk/inc/HAL \
$(LOCAL_PATH)/../include \
$(LOCAL_PATH)/../include/ops \
$(LOCAL_PATH)/../include/utils \
$(LOCAL_PATH)/../include/infernce \
$(LOCAL_PATH)/../include/client \
$(LOCAL_PATH)/../include/cpu_backend \
$(LOCAL_PATH)/../include/libnnext \
$(LOCAL_PATH)/../src
LOCAL_CFLAGS := \
-DLINUX \
-D'OVXLIB_API=__attribute__((visibility("default")))' \
-DANDROID_SDK_VERSION=$(PLATFORM_SDK_VERSION)\
-Wno-sign-compare \
-Wno-implicit-function-declaration \
-Wno-sometimes-uninitialized \
-Wno-unused-parameter \
-Wno-enum-conversion \
-Wno-missing-field-initializers \
-Wno-tautological-compare \
-Wno-missing-braces
LOCAL_MODULE:= libovxlib
LOCAL_MODULE_TAGS := optional
LOCAL_PRELINK_MODULE := false
include $(BUILD_SHARED_LIBRARY)

View File

@ -0,0 +1,184 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (2)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME ("com.vivantecorp.extension.CustomSampleVXC")
#define SCALAR_INPUT_AXIS (3)
__BEGIN_DECLS
DEF_KERNEL_EXECUTOR(_softmax_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size
)
{
vsi_status status = VX_SUCCESS;
float *buffer[_CPU_IO_NUM] = {NULL};
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *attr[_CPU_IO_NUM] = {NULL};
uint32_t i = 0, out_elements = 0;
int32_t axis;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // input0
tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // input1
tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output
attr[0] = vsi_nn_kernel_tensor_attr_create(tensors[0]);
attr[1] = vsi_nn_kernel_tensor_attr_create(tensors[1]);
attr[2] = vsi_nn_kernel_tensor_attr_create(tensors[2]);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
/* alloc the float32 data buffer */
buffer[0] = (float *)vsi_nn_kernel_tensor_create_buffer(tensors[0], attr[0], TRUE);
CHECK_PTR_FAIL_GOTO(buffer[0], "Create input0 buffer fail.", final);
buffer[1] = (float *)vsi_nn_kernel_tensor_create_buffer(tensors[1], attr[1], TRUE);
CHECK_PTR_FAIL_GOTO(buffer[1], "Create input1 buffer fail.", final);
out_elements = (uint32_t)vsi_nn_kernel_tensor_attr_get_size(attr[2]);
buffer[2] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
memset(buffer[2], 0, out_elements * sizeof(float));
/* CPU reference implementation (despite the _softmax_compute name, this sample just adds input1[0] to every element of input0) */
for(i = 0; i < out_elements; i++)
{
buffer[2][i] = buffer[0][i] + buffer[1][0];
}
status = vsi_nn_kernel_tensor_write_from_float(
tensors[2], attr[2], buffer[2], out_elements );
final:
for(i = 0; i < _CPU_IO_NUM; i ++)
{
if(buffer[i])
{
free(buffer[i]);
}
vsi_nn_kernel_tensor_attr_release(&attr[i]);
}
return status;
}
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_softmax_compute,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel(inputs, outputs, kernel);
if(status != VSI_SUCCESS)
{
return NULL;
}
node = vsi_nn_kernel_create_node(graph, kernel);
if(node == NULL)
{
return NULL;
}
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io(backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM);
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis);
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param(node, backend_params, _CPU_PARAM_NUM);
vsi_nn_kernel_scalar_release(&backend_params[SCALAR_INPUT_AXIS]);
return node;
}
__END_DECLS
REGISTER_BACKEND_CPU( custom_sample, _setup )

View File

@ -0,0 +1,103 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_log.h"
#include "kernel/vsi_nn_kernel.h"
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t *param = NULL;
vsi_nn_custom_sample_param *p;
p = &self->nn_param.custom_sample;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32(param, "axis", p->axis);
self->n = (vx_node)vsi_nn_kernel_selector(
self->graph,
"custom_sample",
inputs, 2,
outputs, 1,
param);
vsi_nn_kernel_param_release(&param);
return VSI_SUCCESS;
}
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check params. */
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * node,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
memmove(outputs[0]->attr.size, inputs[0]->attr.size,
inputs[0]->attr.dim_num * sizeof(vsi_size_t));
}
return TRUE;
}
#ifdef __cplusplus
extern "C" {
#endif
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_SAMPLE,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ 2,
/* output_num */ 1
);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,354 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_AVG_POOL3D,
} _internal_kernel_e;
#define _AVG_POOL3D_KERNEL_SOURCE_NAME "avg_pool3d"
// Add kernel hashtable here
#define AVG_POOL3D_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
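// Key layout: input dtype in bits 8 and up, output dtype in the low 8 bits.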
#define AVG_POOL3D_KERNELS( IN_DTYPE, OUT_DTYPE ) \
{ AVG_POOL3D_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.avg_pool3d_"#IN_DTYPE"to"#OUT_DTYPE), \
_AVG_POOL3D_KERNEL_SOURCE_NAME }, \
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _avg_pool3d_kernel_map[] =
{
// Register kernel here
AVG_POOL3D_KERNELS( F32, F32 )
AVG_POOL3D_KERNELS( F32, U32 )
AVG_POOL3D_KERNELS( F32, I32 )
AVG_POOL3D_KERNELS( U32, U32 )
AVG_POOL3D_KERNELS( U32, F32 )
AVG_POOL3D_KERNELS( I32, I32 )
AVG_POOL3D_KERNELS( I32, F32 )
AVG_POOL3D_KERNELS( BF16, BF16 )
};
/*
* Kernel params
*/
static vx_param_description_t _avg_pool3d_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _AVG_POOL3D_PARAM_NUM _cnt_of_array( _avg_pool3d_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_avg_pool3d_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vx_scalar depth_out = (vx_scalar)param[14];
int32_t depth_out_value;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vxReadScalarValue(depth_out, &depth_out_value);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = depth_out_value;
gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _avg_pool3d_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _avg_pool3d_kernel_map;
size_t kernel_map_size = _cnt_of_array( _avg_pool3d_kernel_map );
vx_param_description_t * param_def = _avg_pool3d_kernel_param_def;
vx_kernel_initialize_f initializer = _avg_pool3d_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \
(( in_dtype ) | (out_dtype << 8 ))
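/* The CL sources only provide 32-bit kernel variants, so narrower dtypes are
 * folded onto their 32-bit counterparts below (F16 -> F32, I8/I16 -> I32,
 * U8 -> U32) before the kernel-map lookup. */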
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
case _PACK_SELECT_KEY(F32, F16):
case _PACK_SELECT_KEY(F16, F32):
key = AVG_POOL3D_HASH_KEY( F32, F32);
break;
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = AVG_POOL3D_HASH_KEY( F32, U32);
break;
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F32, I16):
case _PACK_SELECT_KEY(F16, I8):
case _PACK_SELECT_KEY(F16, I16):
key = AVG_POOL3D_HASH_KEY( F32, I32);
break;
case _PACK_SELECT_KEY(U8, U8):
key = AVG_POOL3D_HASH_KEY( U32, U32);
break;
case _PACK_SELECT_KEY(U8, F16):
case _PACK_SELECT_KEY(U8, F32):
key = AVG_POOL3D_HASH_KEY( U32, F32);
break;
case _PACK_SELECT_KEY(I8, I8):
case _PACK_SELECT_KEY(I8, I16):
case _PACK_SELECT_KEY(I16, I8):
case _PACK_SELECT_KEY(I16, I16):
key = AVG_POOL3D_HASH_KEY( I32, I32);
break;
case _PACK_SELECT_KEY(I8, F16):
case _PACK_SELECT_KEY(I8, F32):
case _PACK_SELECT_KEY(I16, F16):
case _PACK_SELECT_KEY(I16, F32):
key = AVG_POOL3D_HASH_KEY( I32, F32);
break;
default:
key = AVG_POOL3D_HASH_KEY( in_dtype, out_dtype);
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _avg_pool3d_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_AVG_POOL3D_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t width = (int32_t)inputs[0]->attr.size[0];
int32_t height = (int32_t)inputs[0]->attr.size[1];
int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
int32_t ksize_z = vsi_nn_kernel_param_get_int32(params, "ksize_z");
int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
int32_t stride_z = vsi_nn_kernel_param_get_int32(params, "stride_z");
int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left");
int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top");
int32_t pad_front = vsi_nn_kernel_param_get_int32(params, "pad_front");
int32_t depth_in = vsi_nn_kernel_param_get_int32(params, "depth_in");
int32_t depth_out = vsi_nn_kernel_param_get_int32(params, "depth_out");
int32_t count_include_pad = vsi_nn_kernel_param_get_int32(params, "count_include_pad");
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ))
{
return NULL;
}
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, _AVG_POOL3D_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_z );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_z );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_front );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &depth_in );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &depth_out );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &count_include_pad );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _AVG_POOL3D_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
vsi_nn_kernel_scalar_release( &node_params[16] );
vsi_nn_kernel_scalar_release( &node_params[17] );
vsi_nn_kernel_scalar_release( &node_params[18] );
vsi_nn_kernel_scalar_release( &node_params[19] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( avg_pool3d, _setup )

View File

@ -0,0 +1,381 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_BILINEAR_GRID_SAMPLE,
} _internal_kernel_e;
#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() "bilinear_grid_sample"
#define STR(a) #a
// Add kernel hashtable here
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \
}
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _bilinear_grid_sample_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP(F32, F32, F32 ),
PACK_KERNEL_MAP(U8, U8, U8),
};
/*
* Kernel params
*/
static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM 8
#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM \
_cnt_of_array(_bilinear_grid_sample_kernel_param_def)
#define SCALAR_HALF_INPUT0_W (3)
#define SCALAR_HALF_INPUT0_H (4)
#define SCALAR_ADD_VALUE_W (5)
#define SCALAR_ADD_VALUE_H (6)
#define SCALAR_DEPTH (7)
#define SCALAR_INPUT0_SCALE (8)
#define SCALAR_INPUT0_TAIL (9)
#define SCALAR_INPUT1_SCALE (10)
#define SCALAR_INPUT1_TAIL (11)
#define SCALAR_OUTPUT_SCALE (12)
#define SCALAR_OUTPUT_TAIL (13)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
vsi_nn_kernel_tensor_attr_t* output_attr = NULL;
vsi_size_array_t* out_shape = NULL;
output_attr =
vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final);
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = 2;
gpu_param.global_size[0] =
gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) /
gpu_param.global_scale[0],
4);
gpu_param.global_size[1] =
((out_shape->data[1] + gpu_param.global_scale[1] - 1) /
gpu_param.global_scale[1]);
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) \
if (_PTR) { \
vsi_nn_kernel_tensor_attr_release(&_PTR); \
_PTR = NULL; \
}
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _bilinear_grid_sample_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool* is_use_u8_kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype, in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _bilinear_grid_sample_kernel_map;
size_t kernel_map_size = _cnt_of_array( _bilinear_grid_sample_kernel_map );
vx_param_description_t * param_def = _bilinear_grid_sample_kernel_param_def;
size_t param_def_size = _cnt_of_array(_bilinear_grid_sample_kernel_param_def);
vx_kernel_initialize_f initializer = _bilinear_grid_sample_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in0_dtype) {
in0_dtype = F32;
}
if (F16 == in1_dtype) {
in1_dtype = F32;
}
if (F16 == out_dtype) {
out_dtype = F32;
}
if ((U8 == in0_dtype) || (U8 == out_dtype)) {
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
*is_use_u8_kernel = TRUE;
} else {
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
*is_use_u8_kernel = FALSE;
}
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_nn_kernel_node_t node = NULL;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM];
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
uint32_t final_in1_rank = 0;
vsi_nn_tensor_t* rs_tensors = NULL;
vsi_nn_tensor_t* final_tensors[3] = {NULL};
vsi_size_t in0_width = inputs[0]->attr.size[0];
vsi_size_t in0_height = inputs[0]->attr.size[1];
float input0_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0_tail = -(input0_zp * input0_scale);
float input1_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1_tail = -(input1_zp * input1_scale);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
vsi_bool is_use_u8_kernel = FALSE;
int32_t align_corners =
vsi_nn_kernel_param_get_int32(params, "align_corners");
uint32_t pad_val = 0;
int32_t depth = 0;
vsi_nn_kernel_dtype_e in0_dtype;
float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h;
// Check if gpu can support the size
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
inputs[0]->attr.dim_num)) {
return NULL;
}
if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size,
inputs[1]->attr.dim_num)) {
return NULL;
}
final_tensors[0] = inputs[0];
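/* Grids with rank >= 3 are flattened: the two innermost dims collapse into one
 * so the kernel can index the grid as a 2D (optionally batched) image; a
 * rank-3 grid becomes rank 2. */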
if (inputs[1]->attr.dim_num >= 3) {
final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
final_shape[1] = inputs[1]->attr.size[2];
final_shape[2] = 1;
final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1;
final_in1_rank =
inputs[1]->attr.dim_num == 3 ? 2 : inputs[1]->attr.dim_num;
if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) {
return NULL;
}
rs_tensors = vsi_nn_reshape_tensor(graph, inputs[1], final_shape, final_in1_rank);
final_tensors[1] = rs_tensors;
} else {
final_tensors[1] = inputs[1];
}
final_tensors[2] = outputs[0];
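/* Map a normalized grid coordinate x in [-1, 1] to an input pixel coordinate
 * (the kernel presumably evaluates x * half + add with the scalars below):
 *   align_corners: x_in = (x + 1) * (W - 1) / 2   (half = (W-1)/2, add = half)
 *   otherwise:     x_in = ((x + 1) * W - 1) / 2   (half = W/2,     add = W/2 - 0.5)
 * The H direction is handled the same way. */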
if (align_corners) {
half_input0_w = ((float)in0_width - 1.0f) * 0.5f;
half_input0_h = ((float)in0_height - 1.0f) * 0.5f;
add_float_value_w = half_input0_w;
add_float_value_h = half_input0_h;
} else {
half_input0_w = (float)in0_width * 0.5f;
half_input0_h = (float)in0_height * 0.5f;
add_float_value_w = half_input0_w - 0.5f;
add_float_value_h = half_input0_h - 0.5f;
}
depth = (int32_t)inputs[0]->attr.size[2];
in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
if (U8 == in0_dtype) {
pad_val = inputs[0]->attr.dtype.zero_point;
}
status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM,
final_tensors, input_num, &final_tensors[2], output_num );
node_params[SCALAR_HALF_INPUT0_W] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_w );
node_params[SCALAR_HALF_INPUT0_H] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_h );
node_params[SCALAR_ADD_VALUE_W] = vsi_nn_kernel_scalar_create( graph, F32, &add_float_value_w );
node_params[SCALAR_ADD_VALUE_H] = vsi_nn_kernel_scalar_create( graph, F32, &add_float_value_h );
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT(status == VSI_SUCCESS);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_W]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_H]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]);
if (is_use_u8_kernel) {
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]);
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]);
}
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = pad_val;
status = vxSetNodeAttribute(
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
CHECK_STATUS(status);
}
}
}
vsi_safe_release_tensor(rs_tensors);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( bilinear_grid_sample, _setup )

View File

@ -35,6 +35,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@ -258,19 +259,36 @@ static vsi_nn_kernel_node_t _setup
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" );
float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" );
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_size_t new_rank = 0;
vsi_bool ret = TRUE;
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num ) )
if ( ret )
{
return NULL;
}
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape, new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape, new_rank );
status = _query_kernel( kernel, inputs, outputs, image_2d);
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num ) )
{
return NULL;
}
image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1);
status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[1], image_2d);
if ( VSI_SUCCESS == status )
{
@ -279,7 +297,7 @@ static vsi_nn_kernel_node_t _setup
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM,
inputs, input_num, outputs, output_num );
reshape_tensors, input_num, &reshape_tensors[1], output_num );
node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value );
node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
@ -297,6 +315,10 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
vsi_safe_release_tensor( reshape_tensors[0] );
vsi_safe_release_tensor( reshape_tensors[1] );
return node;
} /* _setup() */

View File

@ -34,6 +34,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
@ -287,7 +288,7 @@ static vsi_status _query_kernel
int i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8)
@ -335,31 +336,85 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
int32_t operation = 0;
int32_t operation = vsi_nn_kernel_param_get_int32( params, "operation" );
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } };
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale;
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if ( ret )
{
return NULL;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
#define _swap_tensor(a, b, tmp) \
do { \
tmp = a; \
a = b; \
b = tmp; \
} while(0)
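/* When the second operand owns the larger batch dimension, swap the operands
 * so the kernel broadcasts the smaller one, and mirror any ordering comparison,
 * since a OP b == b OP' a. */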
if (shapes[1][3] > shapes[0][3] && new_rank == 4)
{
vsi_nn_tensor_t* reshape_tmp;
_swap_tensor(reshape_tensors[0], reshape_tensors[1], reshape_tmp);
if (VSI_NN_RELATIONAL_OPS_GREAT == operation)
{
operation = VSI_NN_RELATIONAL_OPS_LESS;
}
else if (VSI_NN_RELATIONAL_OPS_LESS == operation)
{
operation = VSI_NN_RELATIONAL_OPS_GREAT;
}
else if (VSI_NN_RELATIONAL_OPS_GREAT_EQUAL == operation)
{
operation = VSI_NN_RELATIONAL_OPS_LESS_EQUAL;
}
else if (VSI_NN_RELATIONAL_OPS_LESS_EQUAL == operation)
{
operation = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL;
}
}
#undef _swap_tensor
}
else
{
goto final;
}
operation = vsi_nn_kernel_param_get_int32( params, "operation" );
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( inputs, outputs, operation, image_2d, kernel );
if( VSI_SUCCESS == status)
image_2d = (reshape_tensors[2]->attr.dim_num == 2 || reshape_tensors[2]->attr.size[2] == 1);
status = _query_kernel( reshape_tensors, &reshape_tensors[2], operation, image_2d, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
reshape_tensors, 2, &reshape_tensors[2], 1 );
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &input0Scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(
@ -379,6 +434,12 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] );
}
}
final:
vsi_safe_release_tensor( reshape_tensors[0] );
vsi_safe_release_tensor( reshape_tensors[1] );
vsi_safe_release_tensor( reshape_tensors[2] );
return node;
} /* _setup() */

View File

@ -69,14 +69,19 @@ static const struct {
{
HASH_CUMSUM_KERNELS(0, U8, U8)
HASH_CUMSUM_KERNELS(0, F32, F32)
HASH_CUMSUM_KERNELS(0, F32, U8)
HASH_CUMSUM_KERNELS(1, U8, U8)
HASH_CUMSUM_KERNELS(1, F32, F32)
HASH_CUMSUM_KERNELS(1, F32, U8)
HASH_CUMSUM_KERNELS(2, U8, U8)
HASH_CUMSUM_KERNELS(2, F32, F32)
HASH_CUMSUM_KERNELS(2, F32, U8)
HASH_CUMSUM_KERNELS_2D(0, U8, U8)
HASH_CUMSUM_KERNELS_2D(0, F32, F32)
HASH_CUMSUM_KERNELS_2D(0, F32, U8)
HASH_CUMSUM_KERNELS_2D(1, U8, U8)
HASH_CUMSUM_KERNELS_2D(1, F32, F32)
HASH_CUMSUM_KERNELS_2D(1, F32, U8)
};
/*

View File

@ -56,6 +56,10 @@ typedef enum
UNARY_RCP,
UNARY_SIGN,
UNARY_SOFTSIGN,
UNARY_ATAN,
UNARY_ATANH,
UNARY_ACOSH,
UNARY_INVERSE_SIGMOID,
} unary_type_e;
/*
@ -100,10 +104,18 @@ typedef enum
#define RCP_OPERATION rcp
#define SIGN_OPERATION sign
#define SOFTSIGN_OPERATION softsign
#define ATAN_OPERATION atan
#define ATANH_OPERATION atanh
#define ACOSH_OPERATION acosh
#define INVERSE_SIGMOID_OPERATION inverse_sigmoid
#define ADD_UNARY_SH_KERNELS(name, src_type, dst_type) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, src_type, dst_type) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, src_type, dst_type)
#define ADD_UNARY_SH_KERNELS(name) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F32, F32) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F32, F32) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, U8) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, U8) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, F32) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, F32)
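// Each op now registers six kernels: 3D and 2D variants for F32->F32, U8->U8, and U8->F32.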
static const struct {
uint32_t key;
@ -111,39 +123,28 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
ADD_UNARY_SH_KERNELS(SIN, F32, F32)
ADD_UNARY_SH_KERNELS(COS, F32, F32)
ADD_UNARY_SH_KERNELS(EXP, F32, F32)
ADD_UNARY_SH_KERNELS(LOG, F32, F32)
ADD_UNARY_SH_KERNELS(NEG, F32, F32)
ADD_UNARY_SH_KERNELS(HSIGMOID, F32, F32)
ADD_UNARY_SH_KERNELS(MISH, F32, F32)
ADD_UNARY_SH_KERNELS(ROUND, F32, F32)
ADD_UNARY_SH_KERNELS(GELU, F32, F32)
ADD_UNARY_SH_KERNELS(HGELU, F32, F32)
ADD_UNARY_SH_KERNELS(SELU, F32, F32)
ADD_UNARY_SH_KERNELS(CELU, F32, F32)
ADD_UNARY_SH_KERNELS(RCP, F32, F32)
ADD_UNARY_SH_KERNELS(SIGN, F32, F32)
ADD_UNARY_SH_KERNELS(SOFTSIGN, F32, F32)
ADD_UNARY_SH_KERNELS(SIN)
ADD_UNARY_SH_KERNELS(COS)
ADD_UNARY_SH_KERNELS(EXP)
ADD_UNARY_SH_KERNELS(LOG)
ADD_UNARY_SH_KERNELS(NEG)
ADD_UNARY_SH_KERNELS(HSIGMOID)
ADD_UNARY_SH_KERNELS(MISH)
ADD_UNARY_SH_KERNELS(ROUND)
ADD_UNARY_SH_KERNELS(GELU)
ADD_UNARY_SH_KERNELS(HGELU)
ADD_UNARY_SH_KERNELS(SELU)
ADD_UNARY_SH_KERNELS(CELU)
ADD_UNARY_SH_KERNELS(RCP)
ADD_UNARY_SH_KERNELS(SIGN)
ADD_UNARY_SH_KERNELS(SOFTSIGN)
ADD_UNARY_SH_KERNELS(ATAN)
ADD_UNARY_SH_KERNELS(ATANH)
ADD_UNARY_SH_KERNELS(ACOSH)
ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID)
ADD_UNARY_SH_KERNELS(SIN, U8, U8)
ADD_UNARY_SH_KERNELS(COS, U8, U8)
ADD_UNARY_SH_KERNELS(EXP, U8, U8)
ADD_UNARY_SH_KERNELS(LOG, U8, U8)
ADD_UNARY_SH_KERNELS(NEG, U8, U8)
ADD_UNARY_SH_KERNELS(HSIGMOID, U8, U8)
ADD_UNARY_SH_KERNELS(MISH, U8, U8)
ADD_UNARY_SH_KERNELS(ROUND, U8, U8)
ADD_UNARY_SH_KERNELS(GELU, U8, U8)
ADD_UNARY_SH_KERNELS(HGELU, U8, U8)
ADD_UNARY_SH_KERNELS(SELU, U8, U8)
ADD_UNARY_SH_KERNELS(CELU, U8, U8)
ADD_UNARY_SH_KERNELS(RCP, U8, U8)
ADD_UNARY_SH_KERNELS(SIGN, U8, U8)
ADD_UNARY_SH_KERNELS(SOFTSIGN, U8, U8)
ADD_UNARY_SH_KERNELS(NEG, I32, I32)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
};
#undef SIN_OPERATION
@ -161,6 +162,10 @@ static const struct {
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
#undef ATAN_OPERATION
#undef ATANH_OPERATION
#undef ACOSH_OPERATION
#undef INVERSE_SIGMOID_OPERATION
/*
* Kernel params
*/
@ -262,6 +267,10 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, F16):
key = HASH_UNARY_KEY( type, F32, F32, image_2d );
break;
case _PACK_SELECT_KEY(U8, F32):
case _PACK_SELECT_KEY(U8, F16):
key = HASH_UNARY_KEY( type, U8, F32, image_2d );
break;
default:
key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d );
break;
@ -330,7 +339,7 @@ static vsi_nn_kernel_node_t _setup
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
if( ret )
if ( ret )
{
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape, new_rank );
@ -338,7 +347,7 @@ static vsi_nn_kernel_node_t _setup
outputs[0], shape, new_rank );
}
if( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size,
rs_tensors[0]->attr.dim_num ) )
{
return NULL;
@ -348,11 +357,11 @@ static vsi_nn_kernel_node_t _setup
image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1);
status = _query_kernel( rs_tensors, &rs_tensors[1], unary_type, image_2d, kernel );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
rs_tensors, 1, &rs_tensors[1], 1 );
@ -452,5 +461,9 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( softsign, UNARY_SOFTSIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( atan, UNARY_ATAN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( atanh, UNARY_ATANH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( acosh, UNARY_ACOSH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( inverse_sigmoid, UNARY_INVERSE_SIGMOID )
__END_DECLS

View File

@ -49,6 +49,7 @@ typedef enum
#define _GATHER_KERNEL_SOURCE "gather"
#define _GATHER_BATCH_KERNEL_SOURCE "gather_batch"
#define _GATHER_ARRAY_KERNEL_SOURCE "gather_array"
// Add kernel hashtable here
#define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_U8toU8")
@ -61,9 +62,14 @@ typedef enum
#define VX_KERNEL_NAME_GATHER_BATCH_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_batch_I32toI32")
#define VX_KERNEL_NAME_GATHER_BATCH_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_batch_F32toF32")
#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_array_U8toU8")
#define VX_KERNEL_NAME_GATHER_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("cl.gather_array_F16toF16")
#define VX_KERNEL_NAME_GATHER_ARRAY_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_array_I32toI32")
#define VX_KERNEL_NAME_GATHER_ARRAY_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_array_F32toF32")
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d, _batch) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d << 4) | (_batch))
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_array, _batch) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_array << 4) | (_batch))
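// Key layout: input0 dtype bits 24+, input1 dtype bits 16-23, output dtype bits 8-15, is_array bit 4, batch flag bits 0-3.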
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \
@ -75,6 +81,11 @@ typedef enum
VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \
VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -89,6 +100,10 @@ static const struct {
TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(I32, I32, I32, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(F32, I32, F32, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_ARRAY_KERNELS(U8, I32, U8, _GATHER_ARRAY_KERNEL_SOURCE)
TENSOR_GATHER_ARRAY_KERNELS(F16, I32, F16, _GATHER_ARRAY_KERNEL_SOURCE)
TENSOR_GATHER_ARRAY_KERNELS(I32, I32, I32, _GATHER_ARRAY_KERNEL_SOURCE)
TENSOR_GATHER_ARRAY_KERNELS(F32, I32, F32, _GATHER_ARRAY_KERNEL_SOURCE)
};
/*
@ -114,7 +129,8 @@ static vsi_status cal_gather_tensor_reshape_size
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
vsi_size_t batch_dims,
uint32_t idxFlg
uint32_t idxFlg,
int32_t* arrayFlg
)
{
vsi_status status = VSI_FAILURE;
@ -148,18 +164,19 @@ static vsi_status cal_gather_tensor_reshape_size
}
else
{
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
sizes[2] = outerCnt;
if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
sizes[2] = outerCnt;
status = VSI_SUCCESS;
arrayFlg[0] |= 1;
}
status = VSI_SUCCESS;
}
#undef VSI_NN_MAX_IMAGE_WIDTH
return status;
} /* _get_EltOP_tensor_reshape_size */
} /* cal_gather_tensor_reshape_size */
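To make the new control flow concrete, a worked trace (values are invented; GPU_TENSOR_MAX_WIDTH's real value is driver-defined):
/* block_size = 4, elementCnt = 280000, outerCnt = 1 gives
 * sizes = {4, 70000, 1}. If GPU_TENSOR_MAX_WIDTH were 65536 (an assumed
 * value), 70000 >= 65536 would set arrayFlg and route the op to the
 * gather_array kernels registered above; previously this case simply
 * failed the reshape. */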
/*
* Kernel initializer
@ -209,8 +226,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[0] = block_size;
gpu_param.global_size[1] = indices_num;
gpu_param.global_size[2] = block_num;
@ -239,7 +255,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t is_batch
int32_t is_batch,
int32_t is_array
/* Add extra params */
)
{
@ -262,7 +279,7 @@ static vsi_status _query_kernel
output_dtype = I32;
}
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, is_array, is_batch );
for ( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
@ -314,11 +331,12 @@ static vsi_nn_kernel_node_t _setup
int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" );
int32_t is_batch = batch_dims > 0 ? 1 : 0;
vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0;
int32_t i = 0;
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0);
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array);
if (status != VSI_SUCCESS)
{
return NULL;
@ -337,7 +355,7 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, is_batch );
status = _query_kernel( kernel, inputs, outputs, is_batch, is_array );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );

View File

@ -43,6 +43,7 @@ __BEGIN_DECLS
*/
#define KERNEL_SOURCE_1 "gather_nd"
#define KERNEL_SOURCE_2 "gather_nd_3d"
#define KERNEL_SOURCE_3 "gather_nd_batch"
typedef enum
{
@ -52,17 +53,25 @@ __BEGIN_DECLS
_3D
} vsi_nn_kernel_coord_type_e;
#define HASH_GATHER_ND_KEY(_input0_type, _input1_type, _output_type, _coord_dim) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_coord_dim))
#define HASH_GATHER_ND_KEY(_input0_type, _input1_type, _output_type, _coord_dim, _batch_dims) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_coord_dim << 4) | (_batch_dims))
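Illustrative key comparison (not in the diff): only the low nibble separates the batch and non-batch entries, so both kernel families share one lookup table.
/* HASH_GATHER_ND_KEY(U8, I32, U8, _1D, 0) -> non-batch gather_nd entry
 * HASH_GATHER_ND_KEY(U8, I32, U8, _1D, 1) -> batch gather_nd_batch entry */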
#define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("cl.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE), \
{ HASH_GATHER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, 0), \
HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
#define HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \
CVIVANTE_NAMESPACE("cl.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE)
#define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \
{ HASH_GATHER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, 1), \
HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -81,6 +90,12 @@ static const struct {
TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_KERNELS(I32, I32, I32, _3D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_KERNELS(F32, I32, F32, _3D, KERNEL_SOURCE_2)
TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_BATCH_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_BATCH_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_3)
TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_3)
};
/*
@ -103,7 +118,8 @@ static vsi_status cal_gather_nd_tensor_reshape_size
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
uint32_t coordDim,
int32_t* newDim
int32_t* newDim,
int32_t batch_dims
)
{
vsi_status status = VSI_FAILURE;
@ -114,45 +130,63 @@ static vsi_status cal_gather_nd_tensor_reshape_size
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
for (i = 0; i < dims_num; ++i)
{
elementCnt *= input_size[i];
}
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
if(coordDim) // input reshape
if (coordDim) // input reshape
{
uint32_t offset = dims_num - coordDim + 1;
for(i = coordDim-1; i > 0; i--)
{
sizes[i] = input_size[i + offset - 1];
}
for(i = 0; i < offset; i++)
{
sizes[0] *= input_size[i];
}
uint32_t offset = dims_num - coordDim + 1 - batch_dims;
newDim[0] = coordDim;
if(coordDim == 1)
if (batch_dims)
{
newDim[0] = 2;
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
for (i = 0; i < offset; i++)
{
sizes[0] *= input_size[i];
}
for (i = 0; i < coordDim; i++)
{
sizes[i + 1] = input_size[i + offset];
}
newDim[0] = coordDim == 1 ? 2 : 3;
}
else if(coordDim == 4)
else
{
newDim[0] = 3;
for (i = coordDim-1; i > 0; i--)
{
sizes[i] = input_size[i + offset - 1];
}
for (i = 0; i < offset; i++)
{
sizes[0] *= input_size[i];
}
newDim[0] = coordDim;
if (coordDim == 1)
{
newDim[0] = 2;
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
}
else if (coordDim == 4)
{
newDim[0] = 3;
}
}
status = VSI_SUCCESS;
}
else // indices & output reshape
{
if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
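A worked trace of the new batch branch above, with hypothetical shapes:
/* dims_num = 3, input_size = {8, 6, 2}, coordDim = 1, batch_dims = 1:
 *   offset   = 3 - 1 + 1 - 1 = 2
 *   sizes[0] = 8 * 6 = 48          (dims below the batch dim, flattened)
 *   sizes[1] = input_size[2] = 2   (the batch dim, kept separate)
 *   newDim   = 2                   (coordDim == 1 maps to rank 2)
 */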
@ -222,7 +256,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t coord_dim
int32_t coord_dim,
int32_t batch_dims
)
{
vsi_status status = VSI_FAILURE;
@ -234,30 +269,49 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(coord_dim == 1)
if (input0_dtype == F32)
{
input0_dtype = F16;
}
else if (input0_dtype == I32 || input0_dtype == I16)
{
input0_dtype = I8;
}
if (output_dtype == F32)
{
output_dtype = F16;
}
else if (output_dtype == I32 || output_dtype == I16)
{
output_dtype = I8;
}
if (coord_dim == 1)
{
coord_type = _1D;
}
else if(coord_dim == 2)
else if (coord_dim == 2)
{
coord_type = _2D;
}
else if(coord_dim == 3 || coord_dim == 4)
else if (coord_dim == 3 || coord_dim == 4)
{
coord_type = _3D;
}
key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type );
key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_dims );
for( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
{
if( gather_nd_map[i].key == key )
if ( gather_nd_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(gather_nd_map) )
if ( i < _cnt_of_array(gather_nd_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_nd_map[i].function_name );
kernel->info.parameters = _gather_nd_kernel_param_def;
@ -289,29 +343,30 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_GATHER_ND_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim);
status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim);
status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim);
if(status != VSI_SUCCESS)
status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims);
status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims);
status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims);
if (status != VSI_SUCCESS)
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, coord_dim );
if( VSI_SUCCESS == status)
status = _query_kernel( kernel, inputs, outputs, coord_dim, batch_dims );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */

View File

@ -0,0 +1,292 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define _GLOBALLPPOOL_KERNEL_SOURCE_NAME "globallppool"
// Add kernel hashtable here
#define GLOBALLPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
#define GLOBALLPPOOL_KERNELS( IN_DTYPE, OUT_DTYPE ) \
{ GLOBALLPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.globallppool_"#IN_DTYPE"to"#OUT_DTYPE), \
_GLOBALLPPOOL_KERNEL_SOURCE_NAME }, \
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _globallppool_kernel_map[] =
{
// Register kernel here
GLOBALLPPOOL_KERNELS( F32, F32 )
GLOBALLPPOOL_KERNELS( F32, U32 )
GLOBALLPPOOL_KERNELS( F32, I32 )
GLOBALLPPOOL_KERNELS( U32, U32 )
GLOBALLPPOOL_KERNELS( U32, F32 )
GLOBALLPPOOL_KERNELS( I32, I32 )
GLOBALLPPOOL_KERNELS( I32, F32 )
GLOBALLPPOOL_KERNELS( BF16, BF16 )
};
/*
* Kernel params
*/
static vx_param_description_t _globallppool_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _GLOBALLPPOOL_PARAM_NUM _cnt_of_array( _globallppool_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_globallppool_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_size[0] = (output_shape->data[2] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _globallppool_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _globallppool_kernel_map;
size_t kernel_map_size = _cnt_of_array( _globallppool_kernel_map );
vx_param_description_t * param_def = _globallppool_kernel_param_def;
vx_kernel_initialize_f initializer = _globallppool_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \
(( in_dtype ) | (out_dtype << 8 ))
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
case _PACK_SELECT_KEY(F32, F16):
case _PACK_SELECT_KEY(F16, F32):
key = GLOBALLPPOOL_HASH_KEY( F32, F32);
break;
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = GLOBALLPPOOL_HASH_KEY( F32, U32);
break;
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F32, I16):
case _PACK_SELECT_KEY(F16, I8):
case _PACK_SELECT_KEY(F16, I16):
key = GLOBALLPPOOL_HASH_KEY( F32, I32);
break;
case _PACK_SELECT_KEY(U8, U8):
key = GLOBALLPPOOL_HASH_KEY( U32, U32);
break;
case _PACK_SELECT_KEY(U8, F16):
case _PACK_SELECT_KEY(U8, F32):
key = GLOBALLPPOOL_HASH_KEY( U32, F32);
break;
case _PACK_SELECT_KEY(I8, I8):
case _PACK_SELECT_KEY(I8, I16):
case _PACK_SELECT_KEY(I16, I8):
case _PACK_SELECT_KEY(I16, I16):
key = GLOBALLPPOOL_HASH_KEY( I32, I32);
break;
case _PACK_SELECT_KEY(I8, F16):
case _PACK_SELECT_KEY(I8, F32):
case _PACK_SELECT_KEY(I16, F16):
case _PACK_SELECT_KEY(I16, F32):
key = GLOBALLPPOOL_HASH_KEY( I32, F32);
break;
default:
key = GLOBALLPPOOL_HASH_KEY( in_dtype, out_dtype);
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _globallppool_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
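The dtype folding in the switch above lets one compiled kernel serve several nominal formats; for example (illustrative):
/* An F16 -> F16 global LP pool packs to _PACK_SELECT_KEY(F16, F16); the
 * switch folds it to GLOBALLPPOOL_HASH_KEY(F32, F32), so the lookup lands on
 * the GLOBALLPPOOL_KERNELS( F32, F32 ) entry registered in the map above. */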
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_GLOBALLPPOOL_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t p = vsi_nn_kernel_param_get_int32(params, "p");
int32_t width = (int32_t)inputs[0]->attr.size[0];
int32_t height = (int32_t)inputs[0]->attr.size[1];
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ))
{
return NULL;
}
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, _GLOBALLPPOOL_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _GLOBALLPPOOL_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
}
}
return node;
} /* _setup() */
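The scale/tail rewrites in _setup are the usual affine (de)quantization folded into a multiply-add; as a sketch, assuming per-tensor affine quantization q = x / scale + zp:
/* dequantize: x = (q - zp_in) * scale_in
 *               = q * inputScale + inputTail,   inputTail   = -(zp_in * scale_in)
 * requantize: q = x / scale_out + zp_out
 *               = x * outputScale + outputTail, outputScale = 1 / scale_out
 */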
__END_DECLS
REGISTER_BACKEND_CL( globallppool, _setup )

View File

@ -0,0 +1,365 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _L1NORM_KERNEL_SOURCE_NAME "l1norm"
// Add kernel hashtable here
#define L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d, AXIS) \
(( IN_DTYPE << 24 ) | ( OUT_DTYPE << 16) | (_image_2d << 8) | (AXIS))
#define L1NORM_KERNELS( IN_DTYPE, OUT_DTYPE, AXIS ) \
{ L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 , AXIS), \
CVIVANTE_NAMESPACE("cl.l1norm_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
_L1NORM_KERNEL_SOURCE_NAME }
#define L1NORM_KERNELS_2D( IN_DTYPE, OUT_DTYPE, AXIS ) \
{ L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, AXIS), \
CVIVANTE_NAMESPACE("cl.l1norm_"#IN_DTYPE"to"#OUT_DTYPE"_2D_axis"#AXIS), \
_L1NORM_KERNEL_SOURCE_NAME }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _l1norm_kernel_map[] =
{
// Register kernel here
L1NORM_KERNELS( U32, U32, 0 ),
L1NORM_KERNELS( U32, I32, 0 ),
L1NORM_KERNELS( U32, F32, 0 ),
L1NORM_KERNELS( I32, I32, 0 ),
L1NORM_KERNELS( I32, U32, 0 ),
L1NORM_KERNELS( I32, F32, 0 ),
L1NORM_KERNELS( F32, F32, 0 ),
L1NORM_KERNELS( F32, U32, 0 ),
L1NORM_KERNELS( F32, I32, 0 ),
L1NORM_KERNELS( U32, U32, 1 ),
L1NORM_KERNELS( U32, I32, 1 ),
L1NORM_KERNELS( U32, F32, 1 ),
L1NORM_KERNELS( I32, I32, 1 ),
L1NORM_KERNELS( I32, U32, 1 ),
L1NORM_KERNELS( I32, F32, 1 ),
L1NORM_KERNELS( F32, F32, 1 ),
L1NORM_KERNELS( F32, U32, 1 ),
L1NORM_KERNELS( F32, I32, 1 ),
L1NORM_KERNELS( U32, U32, 2 ),
L1NORM_KERNELS( U32, I32, 2 ),
L1NORM_KERNELS( U32, F32, 2 ),
L1NORM_KERNELS( I32, I32, 2 ),
L1NORM_KERNELS( I32, U32, 2 ),
L1NORM_KERNELS( I32, F32, 2 ),
L1NORM_KERNELS( F32, F32, 2 ),
L1NORM_KERNELS( F32, U32, 2 ),
L1NORM_KERNELS( F32, I32, 2 ),
L1NORM_KERNELS_2D( U32, U32, 0 ),
L1NORM_KERNELS_2D( U32, I32, 0 ),
L1NORM_KERNELS_2D( U32, F32, 0 ),
L1NORM_KERNELS_2D( I32, I32, 0 ),
L1NORM_KERNELS_2D( I32, U32, 0 ),
L1NORM_KERNELS_2D( I32, F32, 0 ),
L1NORM_KERNELS_2D( F32, F32, 0 ),
L1NORM_KERNELS_2D( F32, U32, 0 ),
L1NORM_KERNELS_2D( F32, I32, 0 ),
L1NORM_KERNELS_2D( U32, U32, 1 ),
L1NORM_KERNELS_2D( U32, I32, 1 ),
L1NORM_KERNELS_2D( U32, F32, 1 ),
L1NORM_KERNELS_2D( I32, I32, 1 ),
L1NORM_KERNELS_2D( I32, U32, 1 ),
L1NORM_KERNELS_2D( I32, F32, 1 ),
L1NORM_KERNELS_2D( F32, F32, 1 ),
L1NORM_KERNELS_2D( F32, U32, 1 ),
L1NORM_KERNELS_2D( F32, I32, 1 ),
};
/*
* Kernel params
*/
static vx_param_description_t _l1norm_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
// Add kernel parameters here
};
#define _L1NORM_PARAM_NUM _cnt_of_array( _l1norm_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_l1norm_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vx_int32 axis = 0;
vx_int32 dim = 0;
vx_int32 width = 0;
vx_int32 height = 0;
vx_int32 depth = 0;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
dim = output_shape->size < 3 ? 2 : 3;
width = (vx_int32)output_shape->data[0];
height = (vx_int32)output_shape->data[1];
depth = dim < 3 ? 1 : (vx_int32)output_shape->data[2];
gpu_param.dim = dim;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (axis == 0)
{
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = depth;
}
else if (axis == 1)
{
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 16;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = width;
gpu_param.global_size[1] = 16;
gpu_param.global_size[2] = depth;
}
else
{
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 16;
gpu_param.global_size[0] = width;
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = 16;
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _l1norm_initializer() */
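For reference, the operation behind these kernels is an L1 normalization along axis; each kernel reduces with a 16-lane workgroup, which is why local_size is pinned to 16 on the reduced axis. A scalar sketch, ignoring quantization (the zero-sum guard is an assumption, not taken from the kernels):
#include <math.h>
static void l1norm_ref( const float * x, float * y, int n )
{
    float sum = 0.0f;
    int i;
    for ( i = 0; i < n; i++ ) sum += fabsf( x[i] );
    if ( sum == 0.0f ) sum = 1.0f;   /* assumed guard against divide-by-zero */
    for ( i = 0; i < n; i++ ) y[i] = x[i] / sum;
}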
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d,
int32_t axis
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _l1norm_kernel_map;
size_t kernel_map_size = _cnt_of_array( _l1norm_kernel_map );
vx_param_description_t * param_def = _l1norm_kernel_param_def;
vx_kernel_initialize_f initializer = _l1norm_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in_dtype)
{
in_dtype = F32;
}
else if (U8 == in_dtype)
{
in_dtype = U32;
}
else if (I16 == in_dtype || I8 == in_dtype)
{
in_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (U8 == out_dtype)
{
out_dtype = U32;
}
else if (I16 == out_dtype || I8 == out_dtype)
{
out_dtype = I32;
}
key = L1NORM_HASH_KEY( in_dtype, out_dtype, image_2d, axis);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _l1norm_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_L1NORM_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputZp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
int32_t axis_size = (int32_t)outputs[0]->attr.size[axis];
outputScale = 1.0f / outputScale;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( kernel, inputs, outputs, image_2d, axis );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, _L1NORM_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputZp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_size );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _L1NORM_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( l1norm, _setup )

View File

@ -35,6 +35,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
@ -212,27 +213,52 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_LOGICAL_NOT_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
if ( ret )
{
return NULL;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape, new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape, new_rank );
}
else
{
goto final;
}
image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, image_2d);
if( VSI_SUCCESS == status)
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size,
reshape_tensors[1]->attr.dim_num ) )
{
goto final;
}
image_2d = (reshape_tensors[1]->attr.dim_num == 2 || reshape_tensors[1]->attr.size[2] == 1);
status = _query_kernel( kernel, &reshape_tensors[0], &reshape_tensors[1], image_2d);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_NOT_PARAM_NUM,
inputs, input_num, outputs, output_num );
&reshape_tensors[0], input_num, &reshape_tensors[1], output_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_NOT_PARAM_NUM );
}
}
final:
vsi_safe_release_tensor( reshape_tensors[0] );
vsi_safe_release_tensor( reshape_tensors[1] );
return node;
} /* _setup() */
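vsi_nn_kernel_optimize_element_shape is defined elsewhere; for a purely elementwise op such as logical_not its effect is just to collapse the tensor to the lowest rank the GPU path accepts. A rough sketch of the idea (not the real implementation):
/* Collapse an N-D shape to {total_elements, 1} for an elementwise kernel. */
static void flatten_for_eltwise( const vsi_size_t * in, vsi_size_t rank,
                                 vsi_size_t * out, vsi_size_t * out_rank )
{
    vsi_size_t i, total = 1;
    for ( i = 0; i < rank; i++ ) total *= in[i];
    out[0] = total;
    out[1] = 1;
    *out_rank = 2;
}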

View File

@ -35,7 +35,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
@ -228,30 +228,75 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_LOGICAL_OPS_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" );
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[0], shapes[1], shapes[2], &new_rank );
if ( ret )
{
return NULL;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], new_rank );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], new_rank );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], new_rank );
#define _swap_tensor(a, b, tmp) \
do { \
tmp = a; \
a = b; \
b = tmp; \
} while(0)
if (shapes[1][3] > shapes[0][3] && new_rank == 4)
{
vsi_nn_tensor_t* reshape_tmp;
_swap_tensor(reshape_tensors[0], reshape_tensors[1], reshape_tmp);
}
#undef _swap_tensor
}
else
{
goto final;
}
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size,
reshape_tensors[2]->attr.dim_num ) )
{
goto final;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( kernel, inputs, outputs, image_2d, (vsi_nn_logical_ops_type_t)ops_type);
status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[2],
image_2d, (vsi_nn_logical_ops_type_t)ops_type);
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_OPS_PARAM_NUM,
inputs, input_num, outputs, output_num );
reshape_tensors, input_num, &reshape_tensors[2], output_num );
status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_OPS_PARAM_NUM );
}
}
final:
vsi_safe_release_tensor( reshape_tensors[0] );
vsi_safe_release_tensor( reshape_tensors[1] );
vsi_safe_release_tensor( reshape_tensors[2] );
return node;
} /* _setup() */
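The operand swap above is safe because AND, OR, and XOR are commutative; it only normalizes which side carries the broadcast batch. Illustrative shapes (made up):
/* shapes[0] = {w, h, c, 1}, shapes[1] = {w, h, c, 8}: dim 3 of input1 is
 * larger, so the reshaped tensors are swapped and the kernel always sees the
 * full-batch operand first. */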

View File

@ -64,12 +64,12 @@ __BEGIN_DECLS
#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \
HASH_MATRIXMUL_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \
HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1), \
HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \
HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
@ -83,18 +83,32 @@ static const struct {
const char* source_name;
} matrixmul_map[] =
{
TENSOR_MATRIXMUL_KERNELS(F16, F16, F16, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(F16, F16, F16, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F16, F16, F16, _2D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F16, F16, F16, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1)
};
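Note the fix in the TENSOR_MATRIXMUL*_KERNELS macros above: the registered kernel name is now expanded from the entry's own dtype triple. Illustrative effect:
/* TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _2D, ...) now registers
 * HASH_MATRIXMUL_SH_KERNEL_NAME(U8, U8, F32, _2D), whereas it previously
 * expanded the hard-coded (F32, F32, F32) name for every entry. */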
/*
@ -198,10 +212,44 @@ static vsi_status _query_kernel
dim_type = _3D;
}
if (input0_dtype == I16 || input0_dtype == I32)
{
input0_dtype = I8;
}
else if (input0_dtype == F16)
{
input0_dtype = F32;
}
else if (input0_dtype == U32)
{
input0_dtype = U8;
}
if (input1_dtype == I16 || input1_dtype == I32)
{
input1_dtype = I8;
}
else if (input1_dtype == F16)
{
input1_dtype = F32;
}
else if (input1_dtype == U32)
{
input1_dtype = U8;
}
if (output_dtype == I16 || output_dtype == I32)
{
output_dtype = I8;
}
else if (output_dtype == F16)
{
output_dtype = F32;
}
else if (output_dtype == U32)
{
output_dtype = U8;
}
key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa );
@ -260,6 +308,8 @@ static vsi_nn_kernel_node_t _setup
float scale_out = vsi_nn_get_tensor_scale(outputs[0]);
float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
scale_out = 1 / scale_out;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{

View File

@ -0,0 +1,330 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _MAXUNPOOL_KERNEL_SOURCE_NAME "maxunpool"
// Add kernel hashtable here
#define MAXUNPOOL_HASH_KEY( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE ) \
(( IN_DTYPE0 << 16 ) | ( IN_DTYPE1 << 8 ) | ( OUT_DTYPE ))
#define MAXUNPOOL_KERNELS( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE ) \
{ MAXUNPOOL_HASH_KEY( IN_DTYPE0, I32, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.maxunpool_"#IN_DTYPE0"to"#OUT_DTYPE), \
_MAXUNPOOL_KERNEL_SOURCE_NAME },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _maxunpool_kernel_map[] =
{
// Register kernel here
MAXUNPOOL_KERNELS( F32, I32, F32)
MAXUNPOOL_KERNELS( F32, I32, U32)
MAXUNPOOL_KERNELS( F32, I32, I32)
MAXUNPOOL_KERNELS( U32, I32, U32)
MAXUNPOOL_KERNELS( U32, I32, F32)
MAXUNPOOL_KERNELS( I32, I32, I32)
MAXUNPOOL_KERNELS( I32, I32, F32)
MAXUNPOOL_KERNELS( BF16, I32, BF16)
};
/*
* Kernel params
*/
static vx_param_description_t _maxunpool_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _MAXUNPOOL_PARAM_NUM _cnt_of_array( _maxunpool_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_maxunpool_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _maxunpool_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _maxunpool_kernel_map;
vx_kernel_initialize_f initializer = _maxunpool_initializer;
vx_param_description_t * param_def = _maxunpool_kernel_param_def;
size_t kernel_map_size = _cnt_of_array( _maxunpool_kernel_map );
size_t param_size = _cnt_of_array( _maxunpool_kernel_param_def );
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \
(( in_dtype ) | (out_dtype << 8 ))
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
case _PACK_SELECT_KEY(F32, F16):
case _PACK_SELECT_KEY(F16, F32):
key = MAXUNPOOL_HASH_KEY( F32, I32, F32);
break;
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = MAXUNPOOL_HASH_KEY( F32, I32, U32);
break;
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F32, I16):
case _PACK_SELECT_KEY(F16, I8):
case _PACK_SELECT_KEY(F16, I16):
key = MAXUNPOOL_HASH_KEY( F32, I32, I32);
break;
case _PACK_SELECT_KEY(U8, U8):
key = MAXUNPOOL_HASH_KEY( U32, I32, U32);
break;
case _PACK_SELECT_KEY(U8, F16):
case _PACK_SELECT_KEY(U8, F32):
key = MAXUNPOOL_HASH_KEY( U32, I32, F32);
break;
case _PACK_SELECT_KEY(I8, I8):
case _PACK_SELECT_KEY(I8, I16):
case _PACK_SELECT_KEY(I16, I8):
case _PACK_SELECT_KEY(I16, I16):
key = MAXUNPOOL_HASH_KEY( I32, I32, I32);
break;
case _PACK_SELECT_KEY(I8, F16):
case _PACK_SELECT_KEY(I8, F32):
case _PACK_SELECT_KEY(I16, F16):
case _PACK_SELECT_KEY(I16, F32):
key = MAXUNPOOL_HASH_KEY( I32, I32, F32);
break;
default:
key = MAXUNPOOL_HASH_KEY( in_dtype, I32, out_dtype);
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MAXUNPOOL_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left");
int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right");
int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top");
int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom");
int32_t width_in = (int32_t)inputs[0]->attr.size[0];
int32_t height_in = (int32_t)inputs[0]->attr.size[1];
int32_t width = (int32_t)outputs[0]->attr.size[0];
int32_t height = (int32_t)outputs[0]->attr.size[1];
int32_t batch = (int32_t)outputs[0]->attr.size[2];
int32_t width_nopad = width - pad_left - pad_right;
int32_t height_nopad = height - pad_top - pad_bottom;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ))
{
return NULL;
}
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 3;
vsi_nn_kernel_node_pack_io( node_params, _MAXUNPOOL_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width_nopad );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height_nopad );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width_in );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height_in );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXUNPOOL_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
}
}
return node;
} /* _setup() */
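Semantically, max-unpooling scatters each input value to the flat output position recorded by the paired index tensor and leaves everything else zero. A minimal float sketch (illustrative; padding, batching, and quantization are handled by the scalars passed above):
#include <string.h>
static void maxunpool_ref( const float * in, const int32_t * idx, float * out,
                           int in_count, int out_count )
{
    int i;
    memset( out, 0, sizeof( float ) * (size_t)out_count );
    for ( i = 0; i < in_count; i++ )
    {
        if ( idx[i] >= 0 && idx[i] < out_count )
        {
            out[ idx[i] ] = in[i];   /* scatter back to the max-pool position */
        }
    }
}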
__END_DECLS
REGISTER_BACKEND_CL( maxunpool, _setup )

View File

@ -81,9 +81,11 @@ static const struct {
{
TENSOR_POW_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(U32, F32, U32, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D_FLOAT(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(U32, F32, U32, KERNEL_SOURCE_1)
};
/*
@ -94,6 +96,10 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def)
@ -179,7 +185,25 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_POW_KEY( input0_dtype, input1_dtype, output_dtype, image_2d );
#define _PACK_SELECT_KEY( input0_dtype, input1_dtype, output_dtype) \
((input0_dtype) | (input1_dtype << 8) | (output_dtype << 16))
switch(_PACK_SELECT_KEY(input0_dtype, input1_dtype, output_dtype))
{
case _PACK_SELECT_KEY(F16, F16, F16):
case _PACK_SELECT_KEY(F32, F32, F32):
key = HASH_POW_KEY( F32, F32, F32, image_2d );
break;
case _PACK_SELECT_KEY(U8, F16, U8):
case _PACK_SELECT_KEY(U8, F32, U8):
case _PACK_SELECT_KEY(U32, F16, U32):
case _PACK_SELECT_KEY(U32, F32, U32):
key = HASH_POW_KEY( U32, F32, U32, image_2d );
break;
default:
key = HASH_POW_KEY( input0_dtype, input1_dtype, output_dtype, image_2d );
break;
}
for( i = 0; i < _cnt_of_array(pow_map); i ++ )
{
@ -219,6 +243,13 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
@ -234,11 +265,20 @@ static vsi_nn_kernel_node_t _setup
if( node )
{
uint32_t index = 3;
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 2, outputs, 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
VSI_ASSERT( status == VSI_SUCCESS );
}
}
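With the four new scalars, the pow kernel can fold quantization into multiply-adds; conceptually (a sketch, not the kernel source):
/* x  = q_base * inputScale + inputTail     (inputTail = -(zp_in * scale_in))
 * y  = pow( x, exponent )                  (exponent read from inputs[1])
 * q' = y * outputScale + outputTail        (outputScale = 1 / scale_out)
 */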

View File

@ -0,0 +1,307 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _REVERSESEQUENCE_KERNEL_SOURCE_NAME "reversesequence"
// Add kernel hashtable here
#define REVERSESEQUENCE_HASH_KEY( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE, batch_axis ) \
(( IN_DTYPE0 << 24 ) | ( IN_DTYPE1 << 16 ) | ( OUT_DTYPE << 8) | (batch_axis) )
#define REVERSESEQUENCE_KERNELS( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE, batch_axis ) \
{ REVERSESEQUENCE_HASH_KEY( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE, batch_axis ), \
CVIVANTE_NAMESPACE("cl.reversesequence_"#IN_DTYPE0"to"#OUT_DTYPE#batch_axis), \
_REVERSESEQUENCE_KERNEL_SOURCE_NAME },
typedef enum
{
_axis1 = 0,
_axis2
} vsi_nn_kernel_batch_axis_type_e;
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _reversesequence_kernel_map[] =
{
// Register kernel here
REVERSESEQUENCE_KERNELS( F32, I32, F32, _axis1)
REVERSESEQUENCE_KERNELS( F32, I32, U32, _axis1)
REVERSESEQUENCE_KERNELS( F32, I32, I32, _axis1)
REVERSESEQUENCE_KERNELS( U32, I32, U32, _axis1)
REVERSESEQUENCE_KERNELS( U32, I32, F32, _axis1)
REVERSESEQUENCE_KERNELS( I32, I32, I32, _axis1)
REVERSESEQUENCE_KERNELS( I32, I32, F32, _axis1)
REVERSESEQUENCE_KERNELS( BF16, I32, BF16, _axis1)
REVERSESEQUENCE_KERNELS( F32, I32, F32, _axis2)
REVERSESEQUENCE_KERNELS( F32, I32, U32, _axis2)
REVERSESEQUENCE_KERNELS( F32, I32, I32, _axis2)
REVERSESEQUENCE_KERNELS( U32, I32, U32, _axis2)
REVERSESEQUENCE_KERNELS( U32, I32, F32, _axis2)
REVERSESEQUENCE_KERNELS( I32, I32, I32, _axis2)
REVERSESEQUENCE_KERNELS( I32, I32, F32, _axis2)
REVERSESEQUENCE_KERNELS( BF16, I32, BF16, _axis2)
};
/*
* Kernel params
*/
static vx_param_description_t _reversesequence_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _REVERSESEQUENCE_PARAM_NUM _cnt_of_array( _reversesequence_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_reversesequence_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor input = (vx_tensor)param[0];
vsi_nn_kernel_tensor_attr_t *input_attr = NULL;
vsi_size_array_t *input_shape = NULL;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input );
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
input_shape = input_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (input_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0];
gpu_param.global_size[1] = (input_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = (input_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr)
{
vsi_nn_kernel_tensor_attr_release(&input_attr);
}
return status;
} /* _reversesequence_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t batch_axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _reversesequence_kernel_map;
size_t kernel_map_size = _cnt_of_array( _reversesequence_kernel_map );
vx_param_description_t * param_def = _reversesequence_kernel_param_def;
vx_kernel_initialize_f initializer = _reversesequence_initializer;
vsi_nn_kernel_batch_axis_type_e axis_type = _axis1;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (batch_axis == 2)
{
axis_type = _axis2;
}
#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \
(( in_dtype ) | (out_dtype << 8 ))
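    /* Narrow types reuse the 32-bit kernel variants registered above, so fold them before the lookup. */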
switch(_PACK_SELECT_KEY( in_dtype, out_dtype ))
{
case _PACK_SELECT_KEY(F16, F16):
case _PACK_SELECT_KEY(F32, F32):
key = REVERSESEQUENCE_HASH_KEY( F32, I32, F32, axis_type);
break;
case _PACK_SELECT_KEY(F16, U8):
case _PACK_SELECT_KEY(F32, U8):
key = REVERSESEQUENCE_HASH_KEY( F32, I32, U32, axis_type);
break;
case _PACK_SELECT_KEY(F16, I8):
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F32, I16):
key = REVERSESEQUENCE_HASH_KEY( F32, I32, I32, axis_type);
break;
case _PACK_SELECT_KEY(U8, U8):
key = REVERSESEQUENCE_HASH_KEY( U32, I32, U32, axis_type);
break;
case _PACK_SELECT_KEY(U8, F16):
case _PACK_SELECT_KEY(U8, F32):
key = REVERSESEQUENCE_HASH_KEY( U32, I32, F32, axis_type);
break;
case _PACK_SELECT_KEY(I8, I8):
case _PACK_SELECT_KEY(I16, I16):
key = REVERSESEQUENCE_HASH_KEY( I32, I32, I32, axis_type);
break;
case _PACK_SELECT_KEY(I8, F16):
case _PACK_SELECT_KEY(I8, F32):
case _PACK_SELECT_KEY(I16, F16):
case _PACK_SELECT_KEY(I16, F32):
key = REVERSESEQUENCE_HASH_KEY( I32, I32, F32, axis_type);
break;
case _PACK_SELECT_KEY(BF16, BF16):
key = REVERSESEQUENCE_HASH_KEY( BF16, I32, BF16, axis_type);
break;
default:
key = REVERSESEQUENCE_HASH_KEY( in_dtype, I32, out_dtype, axis_type);
break;
}
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _reversesequence_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_REVERSESEQUENCE_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t batch_axis = vsi_nn_kernel_param_get_int32(params, "batch_axis");
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float inoutScale = inputScale / outputScale;
float inoutTail = outputTail - inputTail * inoutScale;
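    /* Fold both affine quantizations into a single multiply-add applied in the kernel:
     * out = (in - inZp) * inScale / outScale + outZp = in * inoutScale + inoutTail. */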
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ))
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, batch_axis );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 3;
vsi_nn_kernel_node_pack_io( node_params, _REVERSESEQUENCE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _REVERSESEQUENCE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( reversesequence, _setup )
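For reference, dispatching to this backend from an op implementation follows the usual ovxlib kernel-selector pattern; a minimal sketch, assuming the standard vsi_nn_kernel param helpers (graph, inputs and outputs are placeholders):

    vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();
    /* batch_axis picks which dimension indexes the batch (axis 2 maps to the _axis2 kernels). */
    vsi_nn_kernel_param_add_int32( param, "batch_axis", 2 );
    vsi_nn_kernel_node_t n = vsi_nn_kernel_selector( graph, "reversesequence",
            inputs, 2, outputs, 1, param );
    vsi_nn_kernel_param_release( &param );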

View File

@ -88,6 +88,7 @@ static vx_param_description_t _roi_align_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
@ -105,8 +106,9 @@ static vx_param_description_t _roi_align_kernel_param_def[] =
#define SCALAR_SAMPLING_Y_RATIO (15)
#define SCALAR_DEPTH (16)
#define SCALAR_FORMAT (17)
#define PLATFORM_TYPE (18)
#define ROI_ALIGN_PARAM_NUM 18
#define ROI_ALIGN_PARAM_NUM 19
#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
/*
@ -250,6 +252,7 @@ static vsi_nn_kernel_node_t _setup
float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
int32_t platform_type = vsi_nn_kernel_param_get_int32( params, "platform_type" );
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_tail = -(input_zp * input_scale);
@ -318,6 +321,7 @@ static vsi_nn_kernel_node_t _setup
node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio );
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
node_params[SCALAR_FORMAT] = vsi_nn_kernel_scalar_create( graph, I32, &dtype );
node_params[PLATFORM_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &platform_type );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
@ -336,6 +340,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_FORMAT] );
vsi_nn_kernel_scalar_release( &node_params[PLATFORM_TYPE] );
}
}

View File

@ -110,7 +110,7 @@ static vsi_status cal_scatter_nd_tensor_reshape_size
uint32_t i = 0;
vsi_size_t elementCnt = 1;
if(coordDim != 0 && (width == NULL || area == NULL))
if (coordDim != 0 && (width == NULL || area == NULL))
{
return status;
}
@ -118,17 +118,17 @@ static vsi_status cal_scatter_nd_tensor_reshape_size
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
for (i = 0; i < dims_num; ++i)
{
elementCnt *= input_size[i];
}
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
@ -140,17 +140,17 @@ static vsi_status cal_scatter_nd_tensor_reshape_size
return status;
}
if(coordDim == 1) // index shape
if (coordDim == 1) // index shape
{
*width = 0;
*area = 0;
}
else if(coordDim == 2)
else if (coordDim == 2)
{
*width = input_size[dims_num - 2];
*area = 0;
}
else if(coordDim == 3)
else if (coordDim == 3)
{
*width = input_size[dims_num - 3];
*area = input_size[dims_num - 3] * input_size[dims_num - 2];
@ -226,30 +226,33 @@ static vsi_status _query_kernel
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(coord_dim == 1)
if (coord_dim == 1)
{
coord_type = _1D;
}
else if(coord_dim == 2)
else if (coord_dim == 2)
{
coord_type = _2D;
}
else if(coord_dim == 3)
else if (coord_dim == 3)
{
coord_type = _3D;
}
input1_dtype = input1_dtype == F16 ? F32 : input1_dtype;
output_dtype = output_dtype == F16 ? F32 : output_dtype;
key = HASH_SCATTER_ND_KEY( I32, input1_dtype, output_dtype, coord_type );
for( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ )
for ( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ )
{
if( scatter_nd_map[i].key == key )
if ( scatter_nd_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(scatter_nd_map) )
if ( i < _cnt_of_array(scatter_nd_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_map[i].function_name );
kernel->info.parameters = _scatter_nd_kernel_param_def;
@ -287,26 +290,31 @@ static vsi_nn_kernel_node_t _setup
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
vsi_size_t width = 0, area = 0;
status = cal_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, NULL, NULL, &rs_in_dim);
status |= cal_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, NULL, NULL, &rs_idx_dim);
status |= cal_scatter_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
&width, &area, &rs_out_dim);
if(status != VSI_SUCCESS)
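/* The CL kernels only index 1-D/2-D/3-D coordinates (see coord_type in _query_kernel),
   so reject higher coordinate ranks up front. */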
if (coord_dim > 3)
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
status = cal_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, NULL, NULL, &rs_in_dim);
status |= cal_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, NULL, NULL, &rs_idx_dim);
status |= cal_scatter_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
&width, &area, &rs_out_dim);
if (status != VSI_SUCCESS)
{
return NULL;
}
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, coord_dim );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */

View File

@ -111,12 +111,12 @@ static vsi_status cal_scatter_nd_update_tensor_reshape_size
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
for (i = 0; i < dims_num; ++i)
{
elementCnt *= input_size[i];
}
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
@ -235,7 +235,7 @@ static vsi_status _query_kernel
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0 );
for( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ )
for ( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ )
{
if ( scatter_nd_update_map[i].key == key )
{
@ -281,6 +281,13 @@ static vsi_nn_kernel_node_t _setup
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
vsi_size_t width = 0, area = 0, vol = 0;
int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
vsi_size_t *input_size = inputs[2]->attr.size;
uint32_t dims_num = inputs[2]->attr.dim_num;
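/* Coordinates beyond rank 4 cannot be folded into the (x, y, z, w) offsets computed
   below unless the trailing update dimension is 1. */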
if (coord_dim > 4 && input_size[dims_num - 1] > 1)
{
return NULL;
}
status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0],
coord_dim, 0, NULL, NULL, NULL, &rs_in_dim);

View File

@ -113,6 +113,8 @@ static const _kernel_map_type _swish_kernel_map[] =
SWISH_PACK_KERNEL_MAP_2D(U8, U8),
SWISH_PACK_KERNEL_MAP(I32, I32),
SWISH_PACK_KERNEL_MAP_2D(I32, I32),
SWISH_PACK_KERNEL_MAP(F32, U8),
SWISH_PACK_KERNEL_MAP_2D(F32, U8),
HSWISH_PACK_KERNEL_FLOAT_MAP(F32, F32),
HSWISH_PACK_KERNEL_FLOAT_MAP_2D(F32, F32),
HSWISH_PACK_KERNEL_FLOAT_MAP(F16, F16),
@ -222,6 +224,11 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (in_dtype == F16)
in_dtype = F32;
if (out_dtype == F16)
out_dtype = F32;
key = SWISH_HASH_KEY(swish_type, in_dtype, out_dtype, image_2d);
for( i = 0; i < kernel_map_size; i ++ )

View File

@ -279,7 +279,7 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
uint32_t dim = inputs[0]->attr.dim_num;
vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
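/* Default every multiple to 1 (identity) so dimensions past the input rank copy through
   instead of collapsing to size 0. */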
for ( i = 0; i < dim; i++)
{

View File

@ -55,6 +55,13 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk_odd_even_sort" }
#define TOPK_ODD_EVEN_SORT_HASH_KEY2( IN_DTYPE, OUT_DTYPE ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP2( IN_DTYPE, OUT_DTYPE ) \
{ TOPK_ODD_EVEN_SORT_HASH_KEY2( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk_odd_even_sort2" }
typedef struct
{
uint32_t key;
@ -88,6 +95,22 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( I32, I32, 4 ),
PACK_KERNEL_MAP( I32, I32, 5 ),
PACK_KERNEL_MAP( I32, I32, 6 ),
PACK_KERNEL_MAP( F32, U32, 0 ),
PACK_KERNEL_MAP( F32, U32, 1 ),
PACK_KERNEL_MAP( F32, U32, 2 ),
PACK_KERNEL_MAP( F32, U32, 3 ),
PACK_KERNEL_MAP( F32, U32, 4 ),
PACK_KERNEL_MAP( F32, U32, 5 ),
PACK_KERNEL_MAP( F32, U32, 6 ),
PACK_KERNEL_MAP( F32, I32, 0 ),
PACK_KERNEL_MAP( F32, I32, 1 ),
PACK_KERNEL_MAP( F32, I32, 2 ),
PACK_KERNEL_MAP( F32, I32, 3 ),
PACK_KERNEL_MAP( F32, I32, 4 ),
PACK_KERNEL_MAP( F32, I32, 5 ),
PACK_KERNEL_MAP( F32, I32, 6 ),
};
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@ -96,6 +119,8 @@ static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
PACK_ODD_EVEN_SORT_KERNEL_MAP( F32, F32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP( U32, U32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP( I32, I32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP2( F32, U32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP2( F32, I32 ),
};
/*
@ -108,11 +133,15 @@ static vx_param_description_t _topk_kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def )
#define SCALAR_INPUT_NUM_STAGES (3)
#define SCALAR_INPUT_WIDTH (4)
#define SCALAR_INPUT_NUM_STAGES (7)
#define SCALAR_INPUT_WIDTH (8)
static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] =
{
@ -122,10 +151,14 @@ static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] =
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TOPK_ODD_EVEN_SORT_PARAM_NUM _cnt_of_array( _topk_odd_even_sort_kernel_param_def )
#define SCALAR_INPUT_SIZE (5)
#define SCALAR_INPUT_SIZE (9)
/*
* Kernel initializer
*/
@ -251,6 +284,22 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_HASH_KEY( I32, I32, num_stages );
break;
case _PACK_SELECT_KEY(F32, U32):
case _PACK_SELECT_KEY(F16, U32):
case _PACK_SELECT_KEY(F32, U16):
case _PACK_SELECT_KEY(F16, U16):
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = TOPK_HASH_KEY( F32, U32, num_stages );
break;
case _PACK_SELECT_KEY(F32, I32):
case _PACK_SELECT_KEY(F16, I32):
case _PACK_SELECT_KEY(F32, I16):
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I8):
key = TOPK_HASH_KEY( F32, I32, num_stages );
break;
default:
break;
}
@ -318,6 +367,22 @@ static vsi_status _query_odd_even_sort_kernel
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_ODD_EVEN_SORT_HASH_KEY( I32, I32 );
break;
case _PACK_SELECT_KEY(F32, U32):
case _PACK_SELECT_KEY(F16, U32):
case _PACK_SELECT_KEY(F32, U16):
case _PACK_SELECT_KEY(F16, U16):
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = TOPK_ODD_EVEN_SORT_HASH_KEY2( F32, U32 );
break;
case _PACK_SELECT_KEY(F32, I32):
case _PACK_SELECT_KEY(F16, I32):
case _PACK_SELECT_KEY(F32, I16):
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I8):
key = TOPK_ODD_EVEN_SORT_HASH_KEY2( F32, I32 );
break;
default:
break;
}
@ -372,14 +437,24 @@ static vsi_nn_kernel_node_t _setup
int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
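/* Equivalent to ceil(log2(block_size / 2)): the number of sorting stages for one block. */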
vsi_bool is_odd_even_sort = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
for (i = 1; i < inputs[0]->attr.dim_num; i ++)
{
block_num = block_num * inputs[0]->attr.size[i];
}
if( vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE ||
outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 )
if ((vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE ||
outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 ) &&
!(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 &&
(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16)))
{
return NULL;
}
@ -425,10 +500,15 @@ static vsi_nn_kernel_node_t _setup
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = (uint32_t)(input_num + output_num);
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, param_num,
rs_tensors, input_num, &rs_tensors[input_num], output_num );
/* Pass parameters to node. */
node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &inputScale );
node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail );
node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &outputScale );
node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
if (is_odd_even_sort)
{
node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create(
@ -452,8 +532,25 @@ final:
vsi_safe_release_tensor(rs_tensors[2]);
vsi_safe_release_tensor(rs_tensors[3]);
vsi_safe_release_tensor(rs_tensors[4]);
if (is_odd_even_sort)
{
if (node_params[5])
{
vsi_nn_kernel_scalar_release( &node_params[5] );
}
if (node_params[6])
{
vsi_nn_kernel_scalar_release( &node_params[6] );
}
if (node_params[7])
{
vsi_nn_kernel_scalar_release( &node_params[7] );
}
if (node_params[8])
{
vsi_nn_kernel_scalar_release( &node_params[8] );
}
if (node_params[SCALAR_INPUT_SIZE])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SIZE] );
@ -461,6 +558,22 @@ final:
}
else
{
if (node_params[3])
{
vsi_nn_kernel_scalar_release( &node_params[3] );
}
if (node_params[4])
{
vsi_nn_kernel_scalar_release( &node_params[4] );
}
if (node_params[5])
{
vsi_nn_kernel_scalar_release( &node_params[5] );
}
if (node_params[6])
{
vsi_nn_kernel_scalar_release( &node_params[6] );
}
if (node_params[SCALAR_INPUT_NUM_STAGES])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );

View File

@ -1,243 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.add_mean_std_norm")
/*
* Kernel params
*/
static vx_param_description_t _add_mean_std_norm_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ADD_MEAN_STD_NORM_PARAM_NUM _cnt_of_array( _add_mean_std_norm_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
int32_t i;
float mean = .0f, stddev_inv = .0f, variance = .0f, input_d = .0f, data = .0f, eps = .0f;
vsi_ssize_t v_size, n_batch, batch;
/* prepare data */
for(i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(eps));
v_size = in_attr[0]->shape->data[0];
n_batch = in_attr[0]->shape->data[1];
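/* Per batch row: x = in0 + in1, output (x - mean(x)) / stddev(x); eps stands in for a zero variance. */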
for (batch = 0; batch < n_batch; ++batch)
{
float sum = 0.0f;
float sum_sq = 0.0f;
vsi_ssize_t index_base = batch * v_size;
for (i = 0; i < v_size; ++i)
{
vsi_ssize_t index = i + index_base;
input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index];
sum += input_d;
sum_sq += input_d * input_d;
}
mean = sum / v_size;
stddev_inv = 0.0f;
variance = sum_sq / v_size - mean * mean;
if (variance == 0)
{
stddev_inv = (float)(1.0f / sqrt(eps));
}
else
{
stddev_inv = (float)(1.0f / sqrt(variance));
}
for (i = 0; i < v_size; ++i)
{
vsi_ssize_t index = i + index_base;
input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index];
data = (input_d - mean) * stddev_inv;
f32_out_buffer[0][index] = data;
}
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for(i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _add_mean_std_norm_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _add_mean_std_norm_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ADD_MEAN_STD_NORM_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
status = _query_kernel( kernel, inputs, outputs );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ADD_MEAN_STD_NORM_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ADD_MEAN_STD_NORM_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( add_mean_std_norm, _setup )

View File

@ -1,201 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("argmax_sw")
DEF_KERNEL_EXECUTOR(_argmax_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
int32_t i;
int32_t axis = 0;
vsi_ssize_t outerSize = 1;
vsi_ssize_t axisSize = 1;
vsi_ssize_t innerSize = 1;
vsi_ssize_t inner = 0;
vsi_ssize_t outer = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
for (i = 0; i < axis; i++)
{
innerSize *= attr[0]->shape->data[i];
}
axisSize = attr[0]->shape->data[axis];
for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++)
{
outerSize *= attr[0]->shape->data[i];
}
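/* View the input as [inner x axis x outer] with inner fastest varying; each reduction
   scans axisSize elements at stride innerSize. */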
for ( outer = 0; outer < outerSize; ++outer)
{
for ( inner = 0; inner < innerSize; ++inner)
{
float minMaxValue = buffer[0][outer * axisSize * innerSize + inner];
int32_t minMaxIndex = 0;
for (i = 1; i < axisSize; ++i)
{
float value = buffer[0][(outer * axisSize + i) * innerSize + inner];
if (value > minMaxValue)
{
minMaxValue = value;
minMaxIndex = i;
}
}
buffer[1][outer * innerSize + inner] = (float)minMaxIndex;
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _argmax_exec() */
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _argmax_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
#define SCALAR_INPUT_AXIS (2)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( argmax, _setup )

View File

@ -1,202 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("argmin_sw")
DEF_KERNEL_EXECUTOR(_argmin_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
int32_t i;
int32_t axis = 0;
vsi_ssize_t outerSize = 1;
vsi_ssize_t axisSize = 1;
vsi_ssize_t innerSize = 1;
vsi_ssize_t inner = 0;
vsi_ssize_t outer = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
for (i = 0; i < axis; i++)
{
innerSize *= attr[0]->shape->data[i];
}
axisSize = attr[0]->shape->data[axis];
for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++)
{
outerSize *= attr[0]->shape->data[i];
}
for ( outer = 0; outer < outerSize; ++outer)
{
for ( inner = 0; inner < innerSize; ++inner)
{
float minMaxValue = buffer[0][outer * axisSize * innerSize + inner];
int32_t minMaxIndex = 0;
for (i = 1; i < axisSize; ++i)
{
float value = buffer[0][(outer * axisSize + i) * innerSize + inner];
if (value < minMaxValue)
{
minMaxValue = value;
minMaxIndex = i;
}
}
buffer[1][outer * innerSize + inner] = (float)minMaxIndex;
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _argmin_exec() */
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _argmin_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
#define SCALAR_INPUT_AXIS (2)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( argmin, _setup )

View File

@ -1,277 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (4)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.axis_aligned_bbox_transform")
typedef struct vsi_nn_box_encoding_corner_t
{
float x1, y1, x2, y2;
}vsi_nn_box_encoding_corner;
typedef struct vsi_nn_box_encoding_center_t
{
float w, h, x, y;
}vsi_nn_box_encoding_center;
/*
* Kernel params
*/
static vx_param_description_t _axis_aligned_bbox_transform_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def )
static void _to_box_encoding_corner
(
vsi_nn_box_encoding_center* ctr,
vsi_nn_box_encoding_corner* cnr
)
{
cnr->x1 = ctr->x - ctr->w / 2;
cnr->y1 = ctr->y - ctr->h / 2;
cnr->x2 = ctr->x + ctr->w / 2;
cnr->y2 = ctr->y + ctr->h / 2;
}
static void _to_box_encoding_center
(
vsi_nn_box_encoding_corner* cnr,
vsi_nn_box_encoding_center* ctr
)
{
ctr->w = cnr->x2 - cnr->x1;
ctr->h = cnr->y2 - cnr->y1;
ctr->x = (cnr->x1 + cnr->x2) / 2;
ctr->y = (cnr->y1 + cnr->y2) / 2;
}
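/* The compute loop below decodes (dx, dy, dw, dh) deltas against each ROI in center
 * encoding, converts back to corners, and clips to the image bounds. */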
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i;
const uint32_t roiLength = 4;
const uint32_t imageLength = 2;
vsi_size_t numClasses = 0;
vsi_size_t numRois = 0;
vsi_size_t j;
vsi_size_t roiIndex;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
numClasses = in_attr[1]->shape->data[0] / roiLength;
numRois = in_attr[0]->shape->data[1];
for (roiIndex = 0; roiIndex < numRois; roiIndex++)
{
uint32_t batchIndex = (uint32_t)f32_in_buffer[2][roiIndex];
float imageHeight = f32_in_buffer[3][batchIndex * imageLength];
float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1];
vsi_nn_box_encoding_corner roi_cnr;
vsi_nn_box_encoding_center roiBefore;
roi_cnr.x1 = f32_in_buffer[0][roiIndex * roiLength];
roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1];
roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2];
roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3];
_to_box_encoding_center(&roi_cnr, &roiBefore);
for (j = 0; j < numClasses; j++)
{
vsi_nn_box_encoding_center roi_ctr;
vsi_nn_box_encoding_corner roiAfter;
vsi_nn_box_encoding_corner cliped;
vsi_size_t index = (roiIndex * numClasses + j) * roiLength;
roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w);
roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h);
roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w;
roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h;
_to_box_encoding_corner(&roi_ctr, &roiAfter);
cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth);
cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight);
cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth);
cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight);
f32_out_buffer[0][index] = cliped.x1;
f32_out_buffer[0][index + 1] = cliped.y1;
f32_out_buffer[0][index + 2] = cliped.x2;
f32_out_buffer[0][index + 3] = cliped.y2;
}
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _axis_aligned_bbox_transform_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup )

View File

@ -1,222 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (5)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("batch_norm_sw")
static vsi_ssize_t _expand_offset
(
vsi_ssize_t index,
vsi_size_t * shape, vsi_size_t rank,
vsi_size_t * strides, vsi_size_t * out_shape
)
{
vsi_size_t i;
vsi_ssize_t offset = 0;
for( i = 0; i < rank && index; i ++ )
{
if( shape[i] == out_shape[i] )
{
offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] );
}
index /= out_shape[i];
}
return offset;
}
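/* Maps a flat output index to an input offset: dimensions where the input size differs
 * from the output shape (broadcast dims of size 1) contribute no stride. */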
DEF_KERNEL_EXECUTOR(_batch_norm_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_SUCCESS;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
vsi_size_t out_elements = 0;
vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
float eps = 0.f;
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &eps);
CHECK_STATUS_FAIL_GOTO(status, final );
for ( i = 0; i < _CPU_INPUT_NUM; i++)
{
tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] );
buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final );
}
tensors[5] = (vsi_nn_kernel_tensor_t)param[5];
attr[5] = vsi_nn_kernel_tensor_attr_create( tensors[5] );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[5] );
buffer[5] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[5], "Create output buffer fail.", final );
memset( buffer[5], 0, out_elements * sizeof(float) );
for( i = 0; i < out_elements; i ++ )
{
vsi_ssize_t in_offset[5] = {0};
int32_t j = 0;
float src = 0.f;
float mean = 0.f;
float variance = 0.f;
float beta = 0.f;
float gamma = 0.f;
for ( j = 0; j < 5; j++)
{
in_offset[j] = _expand_offset( i, attr[j]->shape->data, (vsi_size_t)attr[j]->shape->size,
stride_size[j], attr[5]->shape->data );
}
src = buffer[0][in_offset[0]];
mean = buffer[1][in_offset[1]];
variance = buffer[2][in_offset[2]];
gamma = buffer[3][in_offset[3]];
beta = buffer[4][in_offset[4]];
buffer[5][i] = (src - mean) * gamma / sqrtf(variance + eps) + beta;
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[5], attr[5],
buffer[5], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _batch_norm_exec() */
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_INPUT_EPS (6)
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _batch_norm_exec;
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float eps = 0;
eps = vsi_nn_kernel_param_get_float32(params, "eps");
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
/* Pass parameters to node. */
backend_params[SCALAR_INPUT_EPS] = vsi_nn_kernel_scalar_create(
graph, F32, &eps );
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_EPS] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( batchnorm_single, _setup )

View File

@ -1,534 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (4)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.box_with_nms_limit")
/*
* Kernel params
*/
static vx_param_description_t _box_with_nms_limit_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BOX_WITH_NMS_LIMIT_PARAM_NUM _cnt_of_array( _box_with_nms_limit_kernel_param_def )
#define SCORE_THRESHOLD (7)
#define MAX_NUM_DETECTIONS (8)
#define NMS_KERNEL_METHOD (9)
#define IOU_THRESHOLD (10)
#define SIGMA (11)
#define NMS_SCORE_THRESHOLD (12)
static float hard_nms_kernel
(
float iou,
float iouThreshold
)
{
return iou < iouThreshold ? 1.0f : 0.0f;
}
static float linear_nms_kernel
(
float iou,
float iouThreshold
)
{
return iou < iouThreshold ? 1.0f : 1.0f - iou;
}
static float gaussian_nms_kernel
(
float iou,
float sigma
)
{
return (float)(exp(-1.0f * iou * iou / sigma));
}
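/* Score decay per NMS variant: hard keeps or drops a box outright, linear scales the
 * score by (1 - iou) once past the threshold, gaussian applies the soft-NMS decay
 * exp(-iou^2 / sigma). */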
void swap_element
(
uint32_t* list,
uint32_t first,
uint32_t second
)
{
uint32_t temp = list[first];
list[first] = list[second];
list[second] = temp;
}
uint32_t max_element
(
float* data,
uint32_t* index_list,
uint32_t len
)
{
uint32_t i;
uint32_t max_index = 0;
float max_val = data[index_list[0]];
for(i = 1; i < len; i++)
{
float val = data[index_list[i]];
if (max_val < val)
{
max_val = val;
max_index = i;
}
}
return max_index;
}
static uint32_t max_comp_func
(
void* data,
int32_t left,
int32_t right
)
{
float* fdata = (float*)data;
return fdata[left] >= fdata[right];
}
void sort_element_by_score
(
float* data,
uint32_t* index_list,
uint32_t len
)
{
vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list);
}
typedef struct
{
float* fdata;
uint32_t numClasses;
} class_comp_param;
static uint32_t class_comp_func
(
void* data,
int32_t left,
int32_t right
)
{
class_comp_param *p = (class_comp_param*)data;
float* fdata = p->fdata;
uint32_t numClasses = p->numClasses;
uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses;
return lhsClass == rhsClass ? fdata[left] > fdata[right]
: lhsClass < rhsClass;
}
static void sort_element_by_class
(
float* data,
uint32_t* index_list,
uint32_t len,
uint32_t numClasses
)
{
class_comp_param class_comp;
class_comp.fdata = data;
class_comp.numClasses = numClasses;
vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list);
}
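/*
 * Groups index_list by class index in ascending order and, within each
 * class, by score in descending order (see class_comp_func above).
 */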
// Takes two axis-aligned boxes given as [x1, y1, x2, y2] and returns their intersection-over-union.
float getIoUAxisAligned
(
const float* roi1,
const float* roi2
)
{
const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
const float x1 = vsi_nn_max(roi1[0], roi2[0]);
const float x2 = vsi_nn_min(roi1[2], roi2[2]);
const float y1 = vsi_nn_max(roi1[1], roi2[1]);
const float y2 = vsi_nn_min(roi1[3], roi2[3]);
const float w = vsi_nn_max(x2 - x1, 0.0f);
const float h = vsi_nn_max(y2 - y1, 0.0f);
const float areaIntersect = w * h;
const float areaUnion = area1 + area2 - areaIntersect;
return areaIntersect / areaUnion;
}
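/*
 * Worked example (illustrative): roi1 = {0, 0, 2, 2} and roi2 = {1, 1, 3, 3}
 * give areaIntersect = 1, areaUnion = 4 + 4 - 1 = 7, so the IoU is ~0.143.
 */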
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
int32_t* int32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
int32_t* int32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0;
float score_threshold = 0;
int32_t max_num_detections = 0;
int32_t nms_kernel_method = 0;
float iou_threshold = 0;
float sigma = 0;
float nms_score_threshold = 0;
uint32_t j = 0, n = 0, b = 0, c = 0;
const uint32_t kRoiDim = 4;
uint32_t numRois = 0;
uint32_t numClasses = 0;
int32_t ind = 0;
uint32_t * batch_data = NULL;
int32_t numBatch = 0;
uint32_t * select = NULL;
uint32_t select_size = 0;
uint32_t scores_index = 0;
uint32_t roi_index = 0;
uint32_t roi_out_index = 0;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
if (i == 2)
{
int32_in_buffer[i] = (int32_t*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( int32_in_buffer[i], "Create input buffer fail.", final );
}
else
{
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
if (i < 2)
{
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
else
{
int32_out_buffer[i] = (int32_t *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( int32_out_buffer[i], "Create output buffer fail.", final );
memset( int32_out_buffer[i], 0, out_bytes[i] );
}
}
#define VSI_NN_KERNEL_READ_SCALAR(type, idx, pointer) \
vsi_nn_kernel_scalar_read_##type((vsi_nn_kernel_scalar_t)param[idx], pointer)
status = VSI_NN_KERNEL_READ_SCALAR(float32, SCORE_THRESHOLD, &score_threshold);
status |= VSI_NN_KERNEL_READ_SCALAR(int32, MAX_NUM_DETECTIONS, &max_num_detections);
status |= VSI_NN_KERNEL_READ_SCALAR(int32, NMS_KERNEL_METHOD, &nms_kernel_method);
status |= VSI_NN_KERNEL_READ_SCALAR(float32, IOU_THRESHOLD, &iou_threshold);
status |= VSI_NN_KERNEL_READ_SCALAR(float32, SIGMA, &sigma);
status |= VSI_NN_KERNEL_READ_SCALAR(float32, NMS_SCORE_THRESHOLD, &nms_score_threshold);
CHECK_STATUS_FAIL_GOTO(status, final );
#undef VSI_NN_KERNEL_READ_SCALAR
numRois = (uint32_t)in_attr[0]->shape->data[1];
numClasses = (uint32_t)in_attr[0]->shape->data[0];
batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t));
CHECK_PTR_FAIL_GOTO( batch_data, "Create batch_data fail.", final );
memset(batch_data, 0, numRois * sizeof(uint32_t));
for (i = 0, ind = -1; i < numRois; i++)
{
if (int32_in_buffer[2][i] != ind)
{
ind = int32_in_buffer[2][i];
numBatch++;
}
batch_data[numBatch - 1]++;
}
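/* batch_data[n] now holds the ROI count of batch n; this assumes the
   per-ROI batch indices in input[2] arrive in contiguous runs. */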
select = (uint32_t*)malloc(numBatch * numRois
* numClasses * sizeof(uint32_t));
CHECK_PTR_FAIL_GOTO( select, "Create select fail.", final );
memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t));
for (n = 0; n < (uint32_t)numBatch; n++)
{
int32_t numDetections_batch = 0;
uint32_t select_start_batch = select_size;
uint32_t select_len = 0;
// Exclude class 0 (background)
for (c = 1; c < numClasses; c++)
{
uint32_t select_start = select_size;
int32_t maxNumDetections0 = max_num_detections;
uint32_t numDetections = 0;
for (b = 0; b < batch_data[n]; b++)
{
uint32_t index = b * numClasses + c;
float score = f32_in_buffer[0][scores_index + index];
if (score > score_threshold) {
select[select_size] = index;
select_size++;
}
}
select_len = select_size - select_start;
if (maxNumDetections0 < 0)
{
maxNumDetections0 = select_len;
}
for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++)
{
// find max score and swap to the front.
int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]),
&(select[select_start + j]), select_len - j) + j;
swap_element(&(select[select_start]), max_index, j);
// Calculate IoU of the rest, swap to the end (discard) if needed.
for (i = j + 1; i < select_len; i++)
{
int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim;
int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim;
float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]),
&(f32_in_buffer[1][roiBase1]));
float kernel_iou;
if (nms_kernel_method == 0)
{
kernel_iou = hard_nms_kernel(iou, iou_threshold);
}
else if (nms_kernel_method == 1)
{
kernel_iou = linear_nms_kernel(iou, iou_threshold);
}
else
{
kernel_iou = gaussian_nms_kernel(iou, sigma);
}
f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou;
if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold)
{
swap_element(&(select[select_start]), i, select_len - 1);
i--;
select_len--;
}
}
numDetections++;
}
select_size = select_start + select_len;
numDetections_batch += numDetections;
}
// Take top max_num_detections.
sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
numDetections_batch);
if (numDetections_batch > max_num_detections && max_num_detections >= 0)
{
select_size = select_start_batch + max_num_detections;
}
select_len = select_size - select_start_batch;
// Sort again by class.
sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
select_len, numClasses);
for (i = 0; i < select_len; i++)
{
int32_t in_index0 = scores_index + select[select_start_batch + i];
int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim;
f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0];
memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]),
&f32_in_buffer[1][in_index1], kRoiDim * sizeof(float));
int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses;
int32_out_buffer[3][roi_out_index] = n;
roi_out_index++;
}
scores_index += batch_data[n] * numClasses;
roi_index += batch_data[n] * numClasses * kRoiDim;
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
if (i < 2)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
}
else
{
status = vsi_nn_kernel_tensor_write( output[i], out_attr[i],
int32_out_buffer[i], out_bytes[i] );
}
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
vsi_nn_safe_free(batch_data);
vsi_nn_safe_free(select);
for (i = 0; i < _INPUT_NUM; i++)
{
vsi_nn_safe_free(f32_in_buffer[i]);
vsi_nn_safe_free(int32_in_buffer[i]);
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
vsi_nn_safe_free(f32_out_buffer[i]);
vsi_nn_safe_free(int32_out_buffer[i]);
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_SUCCESS;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _box_with_nms_limit_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _box_with_nms_limit_kernel_param_def );
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_BOX_WITH_NMS_LIMIT_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float score_threshold = vsi_nn_kernel_param_get_float32( params, "score_threshold" );
int32_t max_num_detections = vsi_nn_kernel_param_get_int32( params, "max_num_detections" );
int32_t nms_kernel_method = vsi_nn_kernel_param_get_int32( params, "nms_kernel_method" );
float iou_threshold = vsi_nn_kernel_param_get_float32( params, "iou_threshold" );
float sigma = vsi_nn_kernel_param_get_float32( params, "sigma" );
float nms_score_threshold = vsi_nn_kernel_param_get_float32( params, "nms_score_threshold" );
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &score_threshold );
node_params[MAX_NUM_DETECTIONS] = vsi_nn_kernel_scalar_create( graph, I32, &max_num_detections );
node_params[NMS_KERNEL_METHOD] = vsi_nn_kernel_scalar_create( graph, I32, &nms_kernel_method );
node_params[IOU_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold );
node_params[SIGMA] = vsi_nn_kernel_scalar_create( graph, F32, &sigma );
node_params[NMS_SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &nms_score_threshold );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCORE_THRESHOLD] );
vsi_nn_kernel_scalar_release( &node_params[MAX_NUM_DETECTIONS] );
vsi_nn_kernel_scalar_release( &node_params[NMS_KERNEL_METHOD] );
vsi_nn_kernel_scalar_release( &node_params[IOU_THRESHOLD] );
vsi_nn_kernel_scalar_release( &node_params[SIGMA] );
vsi_nn_kernel_scalar_release( &node_params[NMS_SCORE_THRESHOLD] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( box_with_nms_limit, _setup )

View File

@@ -1,229 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.bucketize")
/*
* Kernel params
*/
static vx_param_description_t _bucketize_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def )
#define SCALAR_RIGHT_VALUE (3)
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0, j = 0;
int32_t right = 0;
uint32_t boundaries_size = 0;
/* prepare data */
for(i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_RIGHT_VALUE], &(right));
CHECK_STATUS_FAIL_GOTO( status, final );
boundaries_size = (uint32_t)in_attr[1]->shape->data[0];
for (i = 0; i < out_elements[0]; i++)
{
float src0 = f32_in_buffer[0][i];
float dst = 0;
for (j = 0; j < boundaries_size; j++)
{
float src1 = f32_in_buffer[1][j];
if (right == 1)
{
dst += (src0 >= src1 ? 1.0f : 0.0f);
}
else
{
dst += (src0 > src1 ? 1.0f : 0.0f);
}
}
f32_out_buffer[0][i] = dst;
}
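/* Worked example (illustrative): boundaries = {1, 3, 5} and an input of 3
   yield bucket 1 with right == 0 (strict '>') and bucket 2 with
   right == 1 ('>='). */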
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for(i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _bucketize_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t right = vsi_nn_kernel_param_get_int32( params, "right" );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[SCALAR_RIGHT_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &right );
status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_RIGHT_VALUE] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( bucketize, _setup )

View File

@@ -1,217 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cast")
/*
* Kernel params
*/
static vx_param_description_t _cast_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CAST_PARAM_NUM _cnt_of_array( _cast_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i;
double max_value = 0.0, min_value = 0.0;
vsi_bool clamp_flag = FALSE;
vsi_nn_type_e out_type;
/* prepare data */
for(i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
in_attr[i]->quant = VSI_NN_KERNEL_QUANT_NONE;
in_attr[i]->dfp.fl = 0;
in_attr[i]->asymm.scale = 1.0f;
in_attr[i]->asymm.zero_point = 0;
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
out_type = vsi_nn_dtype_map_kernel(out_attr[0]->dtype);
if( type_is_integer( out_type ) )
{
clamp_flag = TRUE;
type_get_range(out_type, &max_value, &min_value);
}
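/* Illustrative note (not part of the original source): for an int8 output,
   type_get_range presumably yields [-128, 127], so an input of 300.0f
   saturates to 127.0f in the loop below. */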
for (i = 0; i < out_elements[0]; i++)
{
float val = f32_in_buffer[0][i];
if (clamp_flag)
{
val = vsi_nn_clamp(val, (float)min_value, (float)max_value);
}
f32_out_buffer[0][i] = val;
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
out_attr[i]->quant = VSI_NN_KERNEL_QUANT_NONE;
out_attr[i]->dfp.fl = 0;
out_attr[i]->asymm.scale = 1.0f;
out_attr[i]->asymm.zero_point = 0;
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for(i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _cast_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _cast_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CAST_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( kernel, inputs, outputs );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CAST_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CAST_PARAM_NUM );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( cast, _setup )

View File

@@ -1,217 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.clip")
/*
* Kernel params
*/
static vx_param_description_t _clip_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CLIP_PARAM_NUM _cnt_of_array( _clip_kernel_param_def )
#define SCALAR_MIN_VALUE (2)
#define SCALAR_MAX_VALUE (3)
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i;
float min_value = 0.0f;
float max_value = 0.0f;
/* prepare data */
for(i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MIN_VALUE], &(min_value));
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_VALUE], &(max_value));
CHECK_STATUS_FAIL_GOTO( status, final );
for (i = 0; i < out_elements[0]; i++)
{
f32_out_buffer[0][i] = vsi_nn_clamp(f32_in_buffer[0][i], min_value, max_value);
}
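/* Illustrative note (not part of the original source): with min_value = 0
   and max_value = 6 this loop reduces to the common ReLU6 activation. */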
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for(i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _clip_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _clip_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CLIP_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" );
float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" );
status = _query_kernel( kernel, inputs, outputs );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value );
node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CLIP_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MIN_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( clip, _setup )

Some files were not shown because too many files have changed in this diff